Commit 99a0c39e authored by xingjinliang

Sync latest code

parent 50fe58fa
......@@ -75,7 +75,7 @@ OPTIONS=" \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<path to tokenizer> \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
......
......@@ -97,7 +97,7 @@ do
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model <tokenizer model path> \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--position-embedding-type rope \
--rotary-percent 1.0 \
......
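Across these hunks, --tokenizer-model switches from a local path placeholder to a HuggingFace Hub model ID. A minimal sketch, assuming the transformers package and network access to the Hub, for pre-fetching and smoke-testing such a tokenizer before launching a job:

# Sketch assuming the transformers package: pre-download and smoke-test the
# tokenizer that --tokenizer-model now references by Hub ID.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-72B-Instruct")
print(tok("<|im_start|>user\nhello<|im_end|>").input_ids[:8])  # quick vocab check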
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=8192
EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type siglip \
--ckpt-format torch
done
......@@ -95,7 +95,7 @@ do
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model <tokenizer model path> \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
......@@ -135,6 +135,6 @@ do
--gt-path ${GROUNDTRUTH_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nlvm \
--image-tag-type nvlm \
--ckpt-format torch
done
......@@ -80,7 +80,7 @@ OPTIONS=" \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<tokenizer path> \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
......
File mode changed from 100755 to 100644
......@@ -67,7 +67,7 @@ OPTIONS=" \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<tokenizer model path> \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
......
File mode changed from 100755 to 100644
......@@ -24,11 +24,6 @@ if [[ -z $LOAD_NAME ]]; then
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
......@@ -93,7 +88,7 @@ OPTIONS=" \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
......
......@@ -14,11 +14,13 @@ sys.path.append(
import torch
import yaml
from config import EvaluationConfig
from evaluation_datasets import get_evaluation_dataset
from evaluation.evaluation_datasets import get_evaluation_dataset
from model import model_provider
from multimodal_args import add_multimodal_extra_args
from megatron.core import parallel_state
from megatron.core.enums import ModelType
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.inference.text_generation.api import generate_and_post_process
from megatron.inference.text_generation.forward_step import ForwardStep
......@@ -36,7 +38,7 @@ def add_text_generation_args(parser):
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
group.add_argument(
"--out-seq-length", type=int, default=1024, help='Length of the output generated text.'
"--out-seq-length", type=int, default=128, help='Length of the output generated text.'
)
group.add_argument("--output-path", type=str, help='Output file path')
group.add_argument('--input-image-path', type=str, help="Input image directory")
......@@ -206,8 +208,8 @@ def generate_samples(model, config: EvaluationConfig, print_output):
if config.task == "VideoMME":
output["questions"][0][output_name] = generated
else:
output[output_name] = generated
output["prompt"] = prompt
output[output_name] = generated
if config.task == "captioning":
output["ground_truth"] = answers
......@@ -354,7 +356,7 @@ class VLMForwardStep(ForwardStep):
)
def __call__(self, tokens, position_ids, attention_mask):
num_image_tokens = (tokens == self.model.image_token_index).sum().item()
num_image_tokens = (tokens == self.model.module.image_token_index).sum().item()
num_tokens = tokens.size(1)
recv_buffer_seq_length = None
if num_image_tokens > 0:
......@@ -406,7 +408,7 @@ def get_conversation(task, question):
{"role": "system", "content": "Answer the questions."},
{
"role": "user",
"content": "<image>\nProvide a one-sentence caption for provided image.",
"content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image.",
},
]
elif task in ("TextVQA", "VQAv2", "ChartQA"):
......@@ -414,13 +416,13 @@ def get_conversation(task, question):
{"role": "system", "content": "Answer the questions."},
{
"role": "user",
"content": f"<image>\n{question}\nAnswer the question using a single word or phrase.",
"content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.",
},
]
elif task in ("OCRBench", "MathVista", "AI2D"):
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"<image>\n{question}"},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "MMMU":
conversation = [
......@@ -441,7 +443,7 @@ def get_conversation(task, question):
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"<image>\n{question}"},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
return conversation
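These hunks replace the hard-coded "<image>" string with the IMAGE_TOKEN constant, so the placeholder stays in sync with the model definition. A minimal sketch of building a user turn with it, assuming the same import shown earlier in this diff:

from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN

# Place the image placeholder ahead of the question text, as get_conversation does.
user_turn = {"role": "user", "content": f"{IMAGE_TOKEN}\nDescribe the image in one sentence."}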
......@@ -464,11 +466,13 @@ def get_prompt_and_generated(prompt_and_generation, prompt_format):
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
elif prompt_format in ("nvlm-yi-34b", "qwen2p0"):
elif prompt_format in ("nvlm-yi-34b", "qwen2p0", "qwen2p5"):
splitted = prompt_and_generation.split("<|im_start|>assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
else:
raise ValueError(f"Prompt format {prompt_format} is not supported.")
# Remove possible garbage.
generated = generated.strip()
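The split above relies on the chat-template markers these tokenizer formats emit around the model reply. A standalone sketch mirroring the logic in this hunk (illustration only):

def split_prompt_and_generation(text):
    # nvlm-yi-34b / qwen2p0 / qwen2p5 wrap the reply in
    # <|im_start|>assistant\n ... <|im_end|>; everything before the marker is the prompt.
    prompt, _, rest = text.partition("<|im_start|>assistant\n")
    generated = rest.split("<|im_end|>")[0].strip()
    return prompt, generated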
......@@ -489,11 +493,11 @@ def main():
args = get_args()
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
def wrapped_model_provider(pre_process, post_process, add_encoder, add_decoder):
return model_provider(pre_process, post_process, add_encoder, add_decoder, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
model = get_model(wrapped_model_provider, model_type=ModelType.encoder_and_decoder, wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
......
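The new wrapper forwards the add_encoder/add_decoder flags while pinning parallel_output=False; the same thing could be written with functools.partial. A sketch under that assumption:

import functools

# Equivalent to wrapped_model_provider above: fix parallel_output=False and let
# Megatron pass pre_process, post_process, add_encoder and add_decoder per stage.
wrapped_model_provider = functools.partial(model_provider, parallel_output=False)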
File mode changed from 100755 to 100644
......@@ -29,11 +29,6 @@ if [[ -z $LOAD_ITER ]]; then
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml"
......@@ -98,7 +93,7 @@ OPTIONS=" \
--eval-iters 10 \
--eval-interval 500 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
......
......@@ -4,12 +4,13 @@ export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
NUM_FRAMES=1
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
......@@ -19,11 +20,6 @@ while [[ $# -gt 0 ]]; do
shift
shift
;;
-g|--groundtruth-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
......@@ -34,12 +30,7 @@ while [[ $# -gt 0 ]]; do
shift
shift
;;
-t|--tokenizer-path)
TOKENIZER_PATH="$2"
shift
shift
;;
--task)
-t|--task)
TASK="$2"
shift
shift
......@@ -92,7 +83,7 @@ do
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--bf16 \
--micro-batch-size 1 \
......
......@@ -48,7 +48,7 @@ def get_batch(data_iterator):
pp_size = get_pipeline_model_parallel_world_size()
if not is_first_or_last_stage(pp_size, args.encoder_pipeline_model_parallel_size):
# Note these are all set to None above.
return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles
return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params
# Broadcast data.
torch.cuda.nvtx.range_push("get_data")
......@@ -66,7 +66,7 @@ def get_batch(data_iterator):
cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"]
max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"]
# Dummy image, no image.
# No image input (text-only sample) if the dataloader produced a dummy image.
if imgs.shape == torch.Size([1, 1]):
# FIXME: text-only data can cause a hang if the vision model is on its own pipeline rank and --freeze-ViT is enabled.
imgs = torch.tensor([], dtype=torch.float32, device=data_text.device)
......
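The reworded comment documents the text-only convention: the dataloader emits a 1x1 placeholder image for samples without an image, and get_batch swaps it for an empty tensor. A small illustrative helper, assuming that convention:

import torch

def drop_dummy_image(imgs):
    # A text-only sample arrives as a 1x1 placeholder; replace it with an empty
    # tensor so no vision embeddings are computed for it downstream.
    if imgs.shape == torch.Size([1, 1]):
        return torch.tensor([], dtype=torch.float32, device=imgs.device)
    return imgs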
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644