Commit 4b097dee authored by liangjing

update to core_v0.9

parent 3aca1415
{
"COMMENT": "Sources for these prompts include https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Generate a short caption of the image.",
"Describe the image concisely.",
"Provide a brief description of the given image."
],
"llava": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
]
}
}
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
val:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
#!/bin/bash
# Pretrain a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-pretraining"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DATA_VALID="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DEBUG=0
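# Knobs toggled by the DEBUG flag below: BZ = global batch size, NW = number of
# dataloader workers, HD = hidden dropout, LI = logging interval.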
if [[ $DEBUG -eq 1 ]]; then
BZ=32
NW=2
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=256
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 2048 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--data-path ${DATA_TRAIN} \
--valid-path ${DATA_VALID} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 1000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 1.0 \
--weight-decay 1e-2 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--eod-mask-loss \
--freeze-LM \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Generate text using a vision language model."""
import glob
import json
import logging
import os
import sys
from collections import defaultdict
from functools import partial
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
import datasets
import numpy as np
import torch
from dataset_helpers import tokenizer_image_token
from image_processing import get_visual_transform
from MMMU.eval.utils.data_utils import (
CAT_SHORT2LONG,
construct_prompt,
load_yaml,
process_single_sample,
)
from MMMU.eval.utils.eval_utils import parse_multi_choice_response
from PIL import Image
from train import add_multimodal_extra_args, get_num_image_embeddings, model_provider
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN_INDEX
from megatron.inference.text_generation.api import generate_and_post_process
from megatron.inference.text_generation.forward_step import ForwardStep
from megatron.training import get_args, get_model, get_tokenizer, print_rank_0
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
def add_text_generation_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='Vision language model text generation arguments')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
group.add_argument(
"--out-seq-length", type=int, default=1024, help='Length of the output generated text.'
)
group.add_argument("--output-path", type=str, required=True, help='Output file path')
group.add_argument('--input-image-path', type=str, required=True, help="Input image directory")
group.add_argument('--input-metadata-path', type=str, help="Input metadata path")
group.add_argument(
'--num-partitions', type=int, default=0, help="Number of partitions for inputs."
)
group.add_argument('--partition-id', type=int, default=0, help="Partition index")
group.add_argument("--drop-vision-class-token", action="store_true", default=False)
group.add_argument("--gt-path", type=str, help="Optional ground truth file")
group.add_argument(
"--task",
type=str,
choices=["captioning", "TextVQA", "VQAv2", "ChartQA", "MMMU"],
help="Generation task to run",
)
group.add_argument(
"--num-samples-per-partition", type=int, default=0, help="Number of samples per partition"
)
group.add_argument(
"--prompt-format",
type=str,
required=True,
choices=["llama3", "mistral"],
help="Prompting format to use",
)
# Add common multimodal arguments needed for e.g. building the model.
parser = add_multimodal_extra_args(parser)
return parser
def _get_partition_bounds(
total_num_samples, num_samples_per_partition, num_partitions, partition_id
):
if num_samples_per_partition == 0:
num_samples_per_partition = total_num_samples // num_partitions
return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1)
def get_evaluation_dataset(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
):
"""Build evaluation dataset."""
images = []
tile_counts = []
questions, answers = [], []
samples, sample_ids = [], []
if task == "TextVQA":
samples = json.load(open(gt_path, encoding='utf-8'))['data']
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(samples), num_samples_per_partition, num_partitions, partition_id
)
samples = samples[lb:ub]
for i in range(len(samples)):
sample = samples[i]
img_file = "{}/{}.jpg".format(input_image_path, sample["image_id"])
if not os.path.exists(img_file):
img_file = img_file.replace('.jpg', '.png')
img = Image.open(img_file)
imgs = get_visual_transform(
img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False
)
images.append(imgs)
tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int))
questions.append(sample["question"])
answers.append(sample["answers"])
sample_ids.append(sample["question_id"])
elif task == "VQAv2":
samples = json.load(open(gt_path, encoding='utf-8'))
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(samples), num_samples_per_partition, num_partitions, partition_id
)
samples = samples[lb:ub]
for i in range(len(samples)):
sample = samples[i]
img_file = "{}/{}".format(input_image_path, sample["image"])
img = Image.open(img_file)
imgs = get_visual_transform(
img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False
)
images.append(imgs)
tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int))
questions.append(sample["question"])
answers.append(sample["answer"])
sample_ids.append(sample["question_id"])
elif task == "ChartQA":
samples = json.load(open(gt_path, encoding='utf-8'))
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(samples), num_samples_per_partition, num_partitions, partition_id
)
samples = samples[lb:ub]
for i in range(len(samples)):
sample = samples[i]
img_file = "{}/{}".format(input_image_path, sample["imgname"])
img = Image.open(img_file)
imgs = get_visual_transform(
img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False
)
images.append(imgs)
tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int))
questions.append(sample["query"])
answers.append(sample["label"])
sample_ids.append(i)
elif task == "captioning":
image_files = sorted(glob.glob(input_image_path + "/*"))
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(image_files), num_samples_per_partition, num_partitions, partition_id
)
image_files = image_files[lb:ub]
gts = json.load(open(gt_path))
answers = defaultdict(list)
for gt in gts["annotations"]:
answers[gt["image_id"]].append(gt['caption'])
# Run image preprocessing.
for i in range(len(image_files)):
image_file = image_files[i]
img = Image.open(image_file)
imgs = get_visual_transform(
img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False
)
images.append(imgs)
tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int))
image_id = int(image_file.split("_")[-1].split(".")[0])
sample_ids.append(image_id)
elif task == 'MMMU':
# The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation.
all_mmmu_datasets = []
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
for subject in CAT_SHORT2LONG.values():
subject_dataset = datasets.load_dataset(
"MMMU/MMMU", subject, split=datasets.Split.VALIDATION, cache_dir=hf_datasets_cache
)
all_mmmu_datasets.append(subject_dataset)
dataset = datasets.concatenate_datasets(all_mmmu_datasets)
dataset = [s for s in dataset if s['id'].startswith("val")]
# Optionally, process only a subset of the input files.
start_idx = 0
end_idx = len(dataset)
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
end_idx = min(len(dataset), end_idx)
# Using the LLaVA config from the MMMU repo.
config = load_yaml("examples/multimodal/MMMU/eval/configs/llava1.5.yaml")
for k, v in config.items():
if isinstance(v, list):
assert len(v) == 1, "only one value supported."
config[k] = v[0]
for idx in range(start_idx, end_idx):
sample = dataset[idx]
sample = process_single_sample(sample)
sample = construct_prompt(sample, config)
img = sample["image"]
imgs = get_visual_transform(
img, img_h, img_w, use_tiling, max_num_tiles, use_thumbnail, augment=False
)
images.append(imgs)
tile_counts.append(torch.tensor([len(imgs)], dtype=torch.int))
sample_ids.append(sample['id'])
# TODO: Support multiple input images and the original image position. Note: <image> is added back in the prompt construction below.
prompt = sample['final_input_prompt']
for i in range(8):
prompt = prompt.replace(f"<image {i}>", "")
questions.append(prompt)
answers.append(sample['answer'])
samples.append(sample)
else:
raise NotImplementedError("unsupported task")
return images, tile_counts, samples, sample_ids, questions, answers
def generate_samples(model):
"""Text generation using a trained vision language model."""
args = get_args()
images, tile_counts, samples, sample_ids, questions, answers = get_evaluation_dataset(
args.task,
args.input_image_path,
args.gt_path,
args.img_h,
args.img_w,
args.use_tiling,
args.max_num_tiles,
args.use_thumbnail,
args.num_samples_per_partition,
args.num_partitions,
args.partition_id,
)
num_samples = len(sample_ids)
idx = 0
while idx < num_samples:
imgs = torch.stack(images[idx]).cuda()
num_tiles = tile_counts[idx].cuda()
sample_id = sample_ids[idx]
prompt = get_prompt(args.task, questions, idx, args.prompt_format)
forward_step = partial(VLMForwardStep, imgs, num_tiles)
if torch.distributed.get_rank() == 0:
resp_sentences, _, _, _ = generate_and_post_process(
model,
forward_step=forward_step,
prompts=[prompt],
tokens_to_generate=args.out_seq_length,
top_k_sampling=args.top_k,
top_p_sampling=args.top_p,
add_BOS=False,
temperature=args.temperature,
random_seed=args.seed,
detokenize_segments=False,
)
for prompt, generation in zip([prompt], resp_sentences):
output = {"sample_id": sample_id, "prompt": prompt}
output_name = ""
if args.task == "captioning":
output_name = "caption"
elif args.task in ("TextVQA", "VQAv2", "ChartQA"):
output_name = "answer"
elif args.task in ("MMMU"):
output_name = "text"
generated = get_generated(prompt, args.prompt_format, generation)
output[output_name] = generated
if args.task == "captioning":
output["ground_truth"] = answers[sample_id]
elif args.task in ("TextVQA", "VQAv2"):
output["gt_answer"] = [ans for ans in answers[idx]]
elif args.task == "ChartQA":
output["gt_answer"] = [answers[idx]]
elif args.task == "MMMU":
sample = samples[idx]
prediction = generated
if sample["question_type"] == "multiple-choice":
prediction = parse_multi_choice_response(
generated, sample["all_choices"], sample["index2ans"]
)
output["prediction"] = prediction
print_rank_0(output)
yield output
idx += 1
else:
generate_and_post_process(model, forward_step=forward_step, detokenize_segments=False)
idx += 1
def generate_and_write_samples(model):
"""Generate text and write to an output file."""
args = get_args()
for output in generate_samples(model):
if torch.distributed.get_rank() == 0:
with open(args.output_path, 'a') as f:
f.write(json.dumps(output) + "\n")
class VLMForwardStep(ForwardStep):
"""Inference forward step for a multimodal model."""
def __init__(self, images, num_tiles, model, max_batch_size, max_sequence_length):
"""Create multimodal forward step."""
total_num_tiles = torch.sum(num_tiles).item()
num_img_embeddings = get_num_image_embeddings() * total_num_tiles
super().__init__(model, max_batch_size, max_sequence_length + num_img_embeddings)
self._images = images
self._num_tiles = num_tiles
def _forward(self, tokens, position_ids, attention_mask):
return self.model(
self._images,
tokens,
position_ids,
attention_mask=None,
inference_params=self.inference_params,
num_image_tiles=self._num_tiles,
)
def __call__(self, tokens, position_ids, attention_mask):
logits = super().__call__(tokens, position_ids, attention_mask)
# On the first inference iteration, we compute image tokens.
# Update the sequence length offset by the number of image tokens.
num_images = (tokens == IMAGE_TOKEN_INDEX).sum().item()
num_tokens = tokens.size(1)
if num_tokens > 1 and num_images > 0:
self.inference_params.sequence_len_offset += (
self.inference_params.key_value_memory_dict["image_tokens_count"] - num_images
)
return logits
def get_prompt(task, questions, idx, prompt_format):
"""Get a prompt for the evaluation task."""
if task == "captioning":
if prompt_format == "llama3":
prompt = "<|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\nProvide a one-sentence caption for provided image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
elif prompt_format == "mistral":
prompt = "<image>Give a short and clear explanation of the subsequent image.\n"
elif task == "TextVQA":
question = questions[idx]
if prompt_format == "llama3":
prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format(
question
)
elif prompt_format == "mistral":
prompt = "<image>\n{}\nAnswer the question using a single word or phrase.".format(
question
)
elif task == "VQAv2":
question = questions[idx]
if prompt_format == "llama3":
prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format(
question
)
elif prompt_format == "mistral":
prompt = "<image>\n{}\nAnswer the question using a single word or phrase.".format(
question
)
elif task == "ChartQA":
question = questions[idx]
if prompt_format == "llama3":
prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".format(
question
)
elif prompt_format == "mistral":
prompt = "<image>\n{}\nAnswer the question using a single word or phrase.".format(
question
)
elif task == "MMMU":
question = questions[idx]
if prompt_format == "llama3":
prompt = "<|start_header_id|>system<|end_header_id|>\n\nAnswer the questions.<|eot_id|>{}<|start_header_id|>user<|end_header_id|>\n\n<image>\n{}\nAnswer the question using a single word or phrase.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
prompt = prompt.format("", question)
elif prompt_format == "mistral":
prompt = "<image>\n{}\nAnswer the question using a single word or phrase.".format(
question
)
return prompt
def get_generated(prompt, prompt_format, prompt_and_generation):
"""Strip prompt and other unnecessary text from generation."""
start = len(prompt.replace("<image>", ""))
if prompt_format == "llama3":
start += len("<|begin_of_text|>")
start += 1
elif prompt_format == "mistral":
start += len("<s><unk><s> ")
generated = prompt_and_generation[start:]
generated = generated.split("<|eot_id|>")[0]
generated = generated.split("</s>")[0]
generated = generated.strip()
generated = generated.split("\n\n")[0]
generated = generated.split("\n")[0]
return generated
def patch_tokenizer(args):
"""Patch tokenizer with image token support."""
def _decorate_tokenize(f):
# When tokenizing, replace <image> with the image token index (-200)
def wrapper(prompt):
tokens = tokenizer_image_token(args, prompt, f)
return tokens
return wrapper
def _decorate_detokenize(f):
# When detokenizing, replace image token index (-200) with a dummy value.
def wrapper(tokens):
tokens = np.array(tokens)
tokens[tokens == IMAGE_TOKEN_INDEX] = 0
tokens = tokens.tolist()
return f(tokens)
return wrapper
tokenizer = get_tokenizer()
tokenizer.tokenize = _decorate_tokenize(tokenizer.tokenize)
tokenizer.detokenize = _decorate_detokenize(tokenizer.detokenize)
def main():
"""Vision language model text generation."""
logging.getLogger(__name__).warning("Models using pipeline parallelism are not supported yet.")
initialize_megatron(extra_args_provider=add_text_generation_args)
args = get_args()
patch_tokenizer(args) # Make the tokenizer support image tokens.
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
model = model[0]
model.eval()
generate_and_write_samples(model)
if __name__ == "__main__":
main()
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 1.
path: <path_to_sft_dataset_in_energon_format>
subflavors:
augmentation: false
val:
datasets:
- weight: 1.
path: <path_to_sft_dataset_in_energon_format>
subflavors:
augmentation: false
#!/bin/bash
# Run SFT on a pretrained multimodal model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-sft"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
if [[ -z $LOAD_ITER ]]; then
echo "Please set LOAD_ITER for pre-trained input model iteration."
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml"
DATA_VALID="${SOURCE}/examples/multimodal/sft_dataset.yaml"
DEBUG=0
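# As in the pretraining script: BZ = global batch size, NW = dataloader workers,
# HD = hidden dropout, LI = logging interval.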
if [[ $DEBUG -eq 1 ]]; then
BZ=8
NW=1
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=128
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 2048 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 1e-6 \
--min-lr 1e-7 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--data-path ${DATA_TRAIN} \
--valid-path ${DATA_VALID} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 500 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--split 100,0,0 \
--clip-grad 0.5 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--eod-mask-loss \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_METADATA_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
--input-metadata-path)
INPUT_METADATA_PATH="$2"
shift
shift
;;
-g|--groundtruth-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--tokenizer-path)
TOKENIZER_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=100
START=0
END=0
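# Each partition evaluates a contiguous slice of the samples (see _get_partition_bounds
# in run_text_generation.py). With START=0 and END=0, only partition 0 is run; widen the
# range to cover all NUM_PARTITIONS.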
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 4 examples/multimodal/run_text_generation.py \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-flash-attn \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--language-model-type mistral_7b \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 8 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--bf16 \
--micro-batch-size 1 \
--seq-length 2048 \
--out-seq-length 700 \
--temperature 1.0 \
--img-h 336 \
--img-w 336 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--input-metadata-path ${INPUT_METADATA_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH}-${TASK}-${PARTITION_ID}.jsonl \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
--disable-vision-class-token \
--prompt-format mistral
done
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Pretrain or SFT multimodal."""
from copy import deepcopy
from functools import partial
import os
import sys
import warnings
import torch
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir, os.path.pardir)))
from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0
from megatron.training.arguments import core_transformer_config_from_args
from megatron.core import mpu, tensor_parallel
from megatron.core.enums import ModelType
from megatron.core.parallel_state import get_tensor_model_parallel_rank
from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
from megatron.core.models.multimodal.llava_model import LLaVAModel
from layer_specs import get_layer_spec, get_mlp_module_spec, get_layer_spec_te
from megatron.training import pretrain
from dataloader_provider import train_valid_test_dataloaders_provider
def model_provider(
pre_process=True, post_process=True, add_encoder=True, add_decoder=True,
parallel_output=True) -> LLaVAModel:
"""Builds the model.
Args:
pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True.
post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True.
add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder
will live on only a subset of the pipeline stages (specifically, only the first stage).
add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder
will live on only a subset of the pipeline stages (specifically, every stage after the first one).
parallel_output (bool): Enable parallel model output.
Returns:
model: A multimodal model.
"""
args = get_args()
use_te = args.use_te
print_rank_0('building a multimodal model ...')
num_image_tokens = get_num_image_embeddings()
old_seq_length = args.seq_length
args.decoder_seq_length = args.seq_length + num_image_tokens
args.seq_length = num_image_tokens
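# For example, with 336x336 images, 14x14 patches and the vision class token disabled
# (as in the example scripts), num_image_tokens = (336 // 14) ** 2 = 576, so a
# user-specified seq_length of 2048 gives decoder_seq_length = 2624.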
if torch.distributed.get_rank() == 0:
warnings.warn("Changed decoder_seq_length to num_image_tokens ({num_image_tokens}) + user-specified seq_length ({old_seq_length}).")
if args.decoder_seq_length > args.max_position_embeddings:
args.max_position_embeddings = args.decoder_seq_length
warnings.warn("Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the full sequence of vit output + llm output.")
base_config = core_transformer_config_from_args(get_args())
base_config.language_model_type = args.language_model_type
base_config.vision_model_type = args.vision_model_type
base_config.calculate_per_token_loss = True
language_config = deepcopy(base_config)
language_config = get_language_model_config(language_config)
if use_te:
language_transformer_layer_spec = get_layer_spec_te(is_vit=False) # TENorm detects LayerNorm/RMS automatically.
else:
language_transformer_layer_spec = get_layer_spec(is_vit=False, normalization=language_config.normalization)
vision_config = deepcopy(base_config)
vision_config = get_vision_model_config(vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling)
vision_model_type = args.vision_model_type
if vision_model_type == "clip":
if use_te:
vision_transformer_layer_spec = get_layer_spec_te(is_vit=True) # TENorm detects LayerNorm/RMS automatically.
else:
vision_transformer_layer_spec = get_layer_spec(is_vit=True, normalization=vision_config.normalization)
else:
raise RuntimeError("unsupported vision model type", vision_model_type)
vision_projection_config = deepcopy(base_config)
vision_projection_config = get_vision_projection_config(vision_projection_config, language_config.hidden_size)
if args.encoder_pipeline_model_parallel_size > 0:
assert args.encoder_pipeline_model_parallel_size == 1, "ViT can only live on 1 pipeline stage."
vision_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size
vision_projection_config.pipeline_model_parallel_size = args.encoder_pipeline_model_parallel_size
if args.encoder_tensor_model_parallel_size > 0:
vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
vision_projection_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
model = LLaVAModel(
language_transformer_config=language_config,
language_transformer_layer_spec=language_transformer_layer_spec,
language_vocab_size=args.padded_vocab_size,
language_max_sequence_length=args.max_position_embeddings,
vision_transformer_config=vision_config,
vision_transformer_layer_spec=vision_transformer_layer_spec,
drop_vision_class_token=args.disable_vision_class_token,
vision_projection_config=vision_projection_config,
vision_projection_layer_spec=vision_projection_layer_spec,
vision_projection_type="mlp",
allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint,
parallel_output=parallel_output,
language_position_embedding_type=args.position_embedding_type,
language_rotary_percent=args.rotary_percent,
pre_process=pre_process,
post_process=post_process,
add_encoder=add_encoder,
add_decoder=add_decoder,
img_h=args.img_h,
img_w=args.img_w,
patch_dim=args.patch_dim,
language_rotary_base=args.rotary_base,
)
model.freeze(freeze_language_model=args.freeze_LM, freeze_vision_model=args.freeze_ViT, freeze_vision_projection=False)
return model
def get_batch(data_iterator):
"""Generate a batch"""
args = get_args()
imgs = None
tokens = None
labels = None
loss_mask = None
attention_mask = None
position_ids = None
num_tiles = None
# Broadcast data.
torch.cuda.nvtx.range_push("get_data")
if data_iterator is not None and get_tensor_model_parallel_rank() == 0:
data = next(data_iterator)
else:
data = None
data_text = tensor_parallel.broadcast_data(["text"], data, torch.int64)["text"]
prompt_len = tensor_parallel.broadcast_data(["prompt_len"], data, torch.int64)["prompt_len"]
target = tensor_parallel.broadcast_data(["target"], data, torch.int64)["target"]
imgs = tensor_parallel.broadcast_data(["imgs"], data, torch.float32)["imgs"]
num_tiles = tensor_parallel.broadcast_data(["num_tiles"], data, torch.int)["num_tiles"]
# Dummy image, no image.
if imgs.shape == torch.Size([1, 1]):
imgs = torch.tensor([], dtype=torch.float32, device=data_text.device)
num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)
torch.cuda.nvtx.range_pop()
tokens_ = data_text.long()
torch.cuda.nvtx.range_push("index tokens")
tokenizer = get_tokenizer()
text_length = args.decoder_seq_length - args.seq_length
tokens = tokens_[:, :text_length].contiguous()
labels = tokens_[:, 1:text_length+1].contiguous()
assert tokens.shape == labels.shape, f"tokens: {tokens.shape} != labels: {labels.shape}"
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("get_ltor_masks_and_position_ids")
if hasattr(tokenizer, 'eod'):
eod_token = tokenizer.eod
elif hasattr(tokenizer, 'eos_id'):
eod_token = tokenizer.eos_id
attention_mask, loss_mask, position_ids = \
get_ltor_masks_and_position_ids(tokens, eod_token,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss,
question_length=prompt_len,
target=target[:, 1:text_length+1]
)
torch.cuda.nvtx.range_pop()
return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles
def get_num_image_embeddings():
"""Get the number of image embeddings per tile."""
args = get_args()
add_class_token = not args.disable_vision_class_token
num_patches_per_dim_h = args.img_h // args.patch_dim
num_patches_per_dim_w = args.img_w // args.patch_dim
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
num_image_embeddings_per_tile = num_patches + (1 if add_class_token else 0)
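# E.g. a 336x336 image with 14x14 patches yields 24 * 24 = 576 patches,
# i.e. 577 embeddings per tile when the class token is kept.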
max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings_per_tile
if max_num_image_embeddings > args.max_position_embeddings:
raise RuntimeError(f"Too many image embeddings {max_num_image_embeddings} for language model max embedding size {args.max_position_embeddings}")
return num_image_embeddings_per_tile
def get_ltor_masks_and_position_ids(data,
eod_token,
reset_position_ids,
reset_attention_mask,
eod_mask_loss,
question_length=None,
target=None,
weights=None):
"""Build masks and position id for left to right model."""
# Extract batch size and sequence length.
micro_batch_size, seq_length = data.size()
# Attention mask (lower triangular).
if reset_attention_mask:
att_mask_batch = micro_batch_size
else:
att_mask_batch = 1
attention_mask = torch.tril(torch.ones(
(att_mask_batch, seq_length, seq_length), device=data.device)).view(
att_mask_batch, 1, seq_length, seq_length)
# Loss mask.
if target is not None:  # use the target prepared in the data preparation step to build the loss mask
loss_mask = torch.ones(target.size(), dtype=torch.float, device=data.device)
loss_mask[target == eod_token] = 0.0 # mask paddings
loss_mask[target == -100] = 0.0 # mask prompts
else: # default creation
loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
if eod_mask_loss:
loss_mask[data == eod_token] = 0.0
if question_length is not None:
for b in range(micro_batch_size):
loss_mask[b, :max(0, question_length[b].item() - 1)] = 0.0
# Position ids.
position_ids = torch.arange(seq_length, dtype=torch.long,
device=data.device)
position_ids = position_ids.unsqueeze(0).expand_as(data)
# We need to clone as the ids will be modified based on batch index.
if reset_position_ids:
position_ids = position_ids.clone()
if question_length is not None:
# Create a mask based on question_length
question_length_mask = torch.arange(loss_mask.size(1), device=loss_mask.device)[None, :] < question_length[:, None]
# Invert the mask (1 where we want to keep the loss, 0 where we want to zero it out)
inverted_mask = ~question_length_mask
# Apply the mask to loss_mask
loss_mask = loss_mask * inverted_mask.float()
if reset_position_ids or reset_attention_mask:
# Loop through the batches:
for b in range(micro_batch_size):
# Find indices where the EOD token is.
eod_index = position_ids[b, data[b] == eod_token]
# Detach indices from positions if we are going to modify the positions.
if reset_position_ids:
eod_index = eod_index.clone()
# Loop through EOD indices:
prev_index = 0
for j in range(eod_index.size()[0]):
i = eod_index[j]
# Prevent tokens after the EOD from attending to tokens before it.
if reset_attention_mask:
attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
# Reset positions.
if reset_position_ids:
position_ids[b, (i + 1):] -= (i + 1 - prev_index)
prev_index = i + 1
# Convert attention mask to binary:
attention_mask = (attention_mask < 0.5)
if weights is not None:
loss_mask = loss_mask * weights
return attention_mask, loss_mask, position_ids
def loss_func(loss_mask, output_tensor):
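"""Aggregate the per-token losses selected by loss_mask and return the local loss sum,
the local token count, and an all-reduced (loss, num_tokens) pair for logging."""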
losses = output_tensor.float()
loss_mask = loss_mask.contiguous().view(-1).float()
total_tokens = loss_mask.sum()
total_loss = torch.sum(losses.view(-1) * loss_mask)
loss = torch.cat([total_loss.view(1), total_tokens.view(1)])
reporting_loss = loss.clone().detach()
torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())
local_num_tokens = loss[1].clone().detach().to(torch.int)
return (
total_loss,
local_num_tokens,
{'lm loss': (reporting_loss[0], reporting_loss[1])},
)
def forward_step(data_iterator, model: LLaVAModel):
"""Forward training step.
Args:
data_iterator (torch.utils.data.dataloader): Input data iterator
model: Multimodal model
Returns:
output_tensor (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size].
loss_func (callable): Loss function with a loss mask specified.
"""
timers = get_timers()
# Get the batch.
timers('batch-generator', log_level=2).start()
tokens, labels, loss_mask, attention_mask, position_ids, images, num_image_tiles = get_batch(data_iterator)
timers('batch-generator').stop()
output_tensor, loss_mask = model(images, tokens, position_ids, attention_mask, labels, loss_mask, num_image_tiles=num_image_tiles)
return output_tensor, partial(loss_func, loss_mask)
def add_multimodal_extra_args(parser):
"""Extra arguments."""
group = parser.add_argument_group(title='multimodal arguments')
group.add_argument('--valid-path', nargs='*', default=None,
help='Path to the validation dataset. Accepted format:'
'1) a single data path, 2) multiple datasets in the'
'form: dataset1-weight dataset1-path dataset2-weight '
'dataset2-path ...')
group.add_argument('--dataset-config', type=str, default=None)
group.add_argument("--prompt-path", type=str, default=None)
group.add_argument('--freeze-LM', action='store_true', default=False)
group.add_argument('--freeze-ViT', action='store_true', default=False)
group.add_argument('--language-model-type', type=str, required=True)
group.add_argument('--vision-model-type', type=str, default="clip")
group.add_argument("--disable-vision-class-token", action="store_true", default=False)
group.add_argument("--allow-missing-vision-projection-checkpoint", action="store_true", default=False)
group.add_argument("--use-te", action="store_true", default=False)
group.add_argument("--dataloader-save", type=str, default=None, help="Energon dataloader state save path")
group.add_argument("--use-tiling", action="store_true", default=False, help="Use input image tiling")
group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles")
group.add_argument("--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile")
return parser
def llava_embedding_ranks(pp_ranks):
"""LLava's embedding ranks consist of the decoder's first and last ranks (ie, the ViT has no embeddings).
Args:
pp_ranks: A list of global ranks that constitute a pipeline group.
"""
args = get_args()
# encoder size is also the index to the first rank of the decoder.
epp = args.encoder_pipeline_model_parallel_size
last_rank = pp_ranks[-1]
if len(pp_ranks) == 1 or pp_ranks[epp] == last_rank:
return [last_rank]
else:
return [pp_ranks[epp], last_rank]
def llava_position_embedding_ranks(pp_ranks):
"""LLava's embedding ranks consist of the singular rank of the model or the decoder's first rank.
Args:
pp_ranks: A list of global ranks that constitute a pipeline group.
"""
args = get_args()
# encoder size is also the index to the first rank of the decoder.
epp = args.encoder_pipeline_model_parallel_size
last_rank = pp_ranks[-1]
if len(pp_ranks) == 1:
return [last_rank]
else:
return [pp_ranks[epp]]
if __name__ == "__main__":
train_valid_test_dataloaders_provider.is_distributed = True
pretrain(
train_valid_test_dataloaders_provider,
model_provider,
ModelType.encoder_and_decoder,
forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
extra_args_provider=add_multimodal_extra_args,
get_embedding_ranks=llava_embedding_ranks,
get_position_embedding_ranks=llava_position_embedding_ranks,
)
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
BERT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--micro-batch-size 4 \
--global-batch-size 8 \
--lr 0.0001 \
--train-iters 2000000 \
--lr-decay-iters 990000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun pretrain_bert.py \
$BERT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
BERT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--micro-batch-size 4 \
--global-batch-size 32 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 990000 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
$BERT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
BERT_ARGS="
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--micro-batch-size 2 \
--global-batch-size 16 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 990000 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
$BERT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document
GPT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 8 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
options=" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 16 \
--num-layers 96 \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--global-batch-size 1536 \
--rampup-batch-size 16 16 5859375 \
--train-samples 146484375 \
--lr-decay-samples 126953125 \
--lr-warmup-samples 183105 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 10 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file <PATH TO gpt-vocab.json> \
--merge-file <PATH TO gpt-merges.txt> \
--save-interval 1000 \
--save <PATH TO CHECKPOINTS DIRECTORY> \
--load <PATH TO CHECKPOINTS DIRECTORY> \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--tensorboard-dir <TENSORBOARD DIRECTORY> \
--fp16 "
run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 8 \
--global-batch-size 64 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--sequence-parallel \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 16 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
# Runs the "217M" parameter biencoder model for ICT retriever
RANK=0
WORLD_SIZE=1
PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
TEXT_DATA_PATH=<Specify path and file prefix of the text data>
TITLE_DATA_PATH=<Specify path and file prefix of the titles>
CHECKPOINT_PATH=<Specify path>
python pretrain_ict.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--micro-batch-size 32 \
--seq-length 256 \
--max-position-embeddings 512 \
--train-iters 100000 \
--vocab-file bert-vocab.txt \
--tokenizer-type BertWordPieceLowerCase \
--DDP-impl torch \
--bert-load ${PRETRAINED_BERT_PATH} \
--log-interval 100 \
--eval-interval 1000 \
--eval-iters 10 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--retriever-score-scaling \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path ${TEXT_DATA_PATH} \
--titles-data-path ${TITLE_DATA_PATH} \
--lr 0.0001 \
--lr-decay-style linear \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction 0.01 \
--save-interval 4000 \
--exit-interval 8000 \
--query-in-block-prob 0.1 \
--fp16
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
T5_ARGS="
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 16 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun pretrain_t5.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
T5_ARGS="
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 128 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
# RETRO MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Data Preprocessing](#2-data-preprocessing)
- [3. Configurations](#3-configurations)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
"${PYTORCH_IMAGE}" \
bash examples/retro/train_retro_2b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH
```
NOTE: Depending on the environment you are running in, the above command might look slightly different.
NOTE: Because Retro preprocesses and caches elements of the pretraining dataset before training begins, some arguments are auto-loaded from the Retro preprocessing configuration. These loaded arguments include (see the sketch after this list):
- `--data-path`
- `--data-cache-path`
- `--eval-interval`
- `--eval-iters`
- `--global-batch-size`
- `--tokenizer-type`
- `--tokenizer-model`
- `--vocab-file`
- `--merge-file`
- `--seed`
- `--seq-length`
- `--train-samples`
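Since these values are read from the Retro project directory produced by preprocessing, a training launch only needs to point at that directory. A minimal sketch (assuming the `pretrain_retro.py` entry point and the `--retro-project-dir` flag used by the preprocessing script later in this commit):
```
torchrun --nproc_per_node 8 pretrain_retro.py \
    --retro-project-dir <path/to/retro/project/directory> \
    ...  # remaining model, parallelism and training arguments
```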
## 2. Data Preprocessing
<a id="markdown-data-preprocessing" name="data-preprocessing"></a>
Retro preprocesses and caches data prior to pretraining, to greatly speed up pretraining. During data preprocessing, the retrieval database is built, and neighbor IDs are queried for each sample within the pretraining dataset. Please see `preprocess_data.sh` for an example script to preprocess data for Retro. The reference documentation for data preprocessing can be found [here](tools/retro/README.md).
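For example, the preprocessing stages can be run one after another by passing the task name to the example script (a sketch; it assumes the script is invoked from its own directory, as in the `./preprocess_data.sh index-add` example inside the script itself):
```
./preprocess_data.sh db-build          # build the retrieval database
./preprocess_data.sh index-train       # train the search index
./preprocess_data.sh index-add         # add chunks to the index
./preprocess_data.sh query-neighbors   # query neighbors for the pretraining dataset
```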
## 3. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run a 2B model. Below are a few other example configurations.
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 4B
```
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 32 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
@@ -7,23 +7,31 @@ unset NCCL_DEBUG
######## Megatron, Retro dirs. ########
REPO_DIR="<path/to/megatron/repo>"
RETRO_WORKDIR="<path/to/retro/data/directory>"
RETRO_PROJECT_DIR="<path/to/retro/project/directory>"
######## Task (e.g., db, index, query). ########
RETRO_TASKS="db-build"
# RETRO_TASKS="index-train"
# RETRO_TASKS="index-add"
# RETRO_TASKS="query-pretraining-neighbors"
# This script takes a single argument, which specifies the retro task to be
# performed. The available tasks are: db-build, index-train, index-add, and
# query-neighbors.
######## Data. ########
# ~~ Examples ~~
# RETRO_TASKS="db-build" # Build the retrieval database
# RETRO_TASKS="index-train" # Train the index
# RETRO_TASKS="index-add" # Add data to the index
# RETRO_TASKS="query-neighbors" # Perform query pretraining for neighbors
# You can also provide the task as a command-line argument when executing the
# script. Example: ./preprocess_data.sh index-add
RETRO_TASKS=$1
######## Data. ########
DATA_BLEND="<see --data-path in arguments.py>"
######## Index. ########
RETRO_INDEX_STR="OPQ32_64,IVF65536_HNSW8,PQ32"
RETRO_INDEX_NTRAIN=1000000
RETRO_INDEX_NTRAIN=66625331
RETRO_INDEX_TRAIN_LOAD_FRACTION=0.97
RETRO_INDEX_ADD_LOAD_FRACTION=0.95
@@ -32,20 +40,19 @@ RETRO_INDEX_ADD_LOAD_FRACTION=0.95
RETRO_GPT_SEED=1234
RETRO_GPT_SPLIT="98,2,0"
RETRO_GPT_DATA_PATH=${DATA_BLEND}
RETRO_GPT_DATA_IMPL=mmap
RETRO_GPT_DATALOADER_TYPE=single
RETRO_GPT_TRAIN_SAMPLES=200000
RETRO_GPT_EVAL_INTERVAL=2000
RETRO_GPT_EVAL_ITERS=50
RETRO_GPT_TRAIN_SAMPLES=200000
RETRO_GPT_LR_DECAY_SAMPLES=175000
RETRO_GPT_LR_WARMUP_SAMPLES=10000
RETRO_GPT_SEQ_LENGTH=512
RETRO_GPT_SEQ_LENGTH=2048
RETRO_GPT_GLOBAL_BATCH_SIZE=256
RETRO_GPT_CHUNK_LENGTH=64
######## Query. ########
RETRO_QUERY_NUM_NEIGHBORS_QUERY=200 RETRO_QUERY_NUM_NEIGHBORS_SAVE=20
RETRO_QUERY_NUM_NEIGHBORS_QUERY=200
RETRO_QUERY_NUM_NEIGHBORS_SAVE=20
RETRO_QUERY_EF_SEARCH=32
RETRO_QUERY_NPROBE=4096
@@ -62,13 +69,12 @@ ARGS=" \
--global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
--seq-length 512 \
--max-position-embeddings 512 \
--load <path/to/bert/checkpoint> \
--load ${RETRO_PROJECT_DIR}/checkpoints/bert \
--exit-on-missing-checkpoint \
--no-load-optim \
--data-path ${RETRO_GPT_DATA_PATH} \
--data-path [null] \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file <path/to/bert/vocab> \
--data-impl ${RETRO_GPT_DATA_IMPL} \
--vocab-file ${RETRO_PROJECT_DIR}/tokenizer/bert-large-uncased-vocab.txt \
--split ${RETRO_GPT_SPLIT} \
--distributed-backend nccl \
--lr 0.0001 \
@@ -81,37 +87,37 @@ ARGS=" \
--clip-grad 1.0 \
--eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
--eval-iters ${RETRO_GPT_EVAL_ITERS} \
--fp16 \
--DDP-impl local \
--dataloader-type ${RETRO_GPT_DATALOADER_TYPE} \
--bf16 \
--no-data-sharding \
--no-gradient-accumulation-fusion \
--no-async-tensor-model-parallel-allreduce \
--bert-embedder-type megatron \
--output-bert-embeddings \
\
--retro-workdir ${RETRO_WORKDIR} \
--retro-project-dir ${RETRO_PROJECT_DIR} \
--retro-tasks ${RETRO_TASKS} \
--retro-return-doc-ids \
--retro-bert-vocab-file <path/to/bert/vocab> \
--retro-bert-vocab-file tokenizer/bert-large-uncased-vocab.txt \
--retro-bert-tokenizer-type BertWordPieceLowerCase \
\
--retro-gpt-seed ${RETRO_GPT_SEED} \
--retro-gpt-tokenizer-type GPTSentencePieceTokenizer \
--retro-gpt-tokenizer-model <path/to/gpt/tokenizer/model> \
--retro-gpt-tokenizer-model /path/to/tokenizer/model \
--retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \
--retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \
--retro-gpt-global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
--retro-gpt-eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
--retro-gpt-eval-iters ${RETRO_GPT_EVAL_ITERS} \
--retro-gpt-split ${RETRO_GPT_SPLIT} \
--retro-gpt-data-impl ${RETRO_GPT_DATA_IMPL} \
--retro-gpt-data-path ${RETRO_GPT_DATA_PATH} \
--retro-gpt-train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
\
--retro-index-str ${RETRO_INDEX_STR} \
--retro-index-ntrain ${RETRO_INDEX_NTRAIN} \
--retro-index-train-load-fraction ${RETRO_INDEX_TRAIN_LOAD_FRACTION} \
--retro-index-add-load-fraction ${RETRO_INDEX_ADD_LOAD_FRACTION} \
--retro-index-no-delete-training-embeddings \
--retro-index-no-delete-added-codes \
--no-retro-index-delete-training-embeddings \
--no-retro-index-delete-added-codes \
\
--retro-query-num-neighbors-query ${RETRO_QUERY_NUM_NEIGHBORS_QUERY} \
--retro-query-num-neighbors-save ${RETRO_QUERY_NUM_NEIGHBORS_SAVE} \
--retro-query-ef-search ${RETRO_QUERY_EF_SEARCH} \
@@ -130,7 +136,7 @@ CMD="\
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port 6000 \
tools/retro/main.py ${ARGS} \
tools/retro/preprocess_data.py ${ARGS} \
"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo "CMD = '$CMD'."