Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
Pipeline #2498 canceled
......@@ -95,7 +95,7 @@ do
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model <tokenizer model path> \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
......@@ -135,6 +135,6 @@ do
--gt-path ${GROUNDTRUTH_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nlvm \
--image-tag-type nvlm \
--ckpt-format torch
done
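
Note: the scripts above now pass a Hugging Face Hub repo ID (NousResearch/Nous-Hermes-2-Yi-34B) to --tokenizer-model instead of a local placeholder path. As a rough, hedged sketch (not this repository's code, and assuming MultimodalTokenizer ultimately delegates to a Hugging Face tokenizer), the same name can be resolved with transformers' AutoTokenizer, which accepts either a local directory or a Hub repo ID:

from transformers import AutoTokenizer

def load_underlying_tokenizer(name_or_path: str):
    # Hypothetical helper for illustration only: accepts a local directory
    # or a Hub repo ID such as "NousResearch/Nous-Hermes-2-Yi-34B".
    return AutoTokenizer.from_pretrained(name_or_path)

tokenizer = load_underlying_tokenizer("NousResearch/Nous-Hermes-2-Yi-34B")
print(tokenizer.vocab_size)  # should line up with --vocab-size 64000 in the script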
......@@ -80,7 +80,7 @@ OPTIONS=" \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<tokenizer path> \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
......
File mode changed from 100755 to 100644
......@@ -67,7 +67,7 @@ OPTIONS=" \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<tokenizer model path> \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
......
File mode changed from 100755 to 100644
......@@ -24,11 +24,6 @@ if [[ -z $LOAD_NAME ]]; then
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
......@@ -93,7 +88,7 @@ OPTIONS=" \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
......
......@@ -14,11 +14,13 @@ sys.path.append(
import torch
import yaml
from config import EvaluationConfig
from evaluation_datasets import get_evaluation_dataset
from evaluation.evaluation_datasets import get_evaluation_dataset
from model import model_provider
from multimodal_args import add_multimodal_extra_args
from megatron.core import parallel_state
from megatron.core.enums import ModelType
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.inference.text_generation.api import generate_and_post_process
from megatron.inference.text_generation.forward_step import ForwardStep
......@@ -36,7 +38,7 @@ def add_text_generation_args(parser):
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
group.add_argument(
"--out-seq-length", type=int, default=1024, help='Length of the output generated text.'
"--out-seq-length", type=int, default=128, help='Length of the output generated text.'
)
group.add_argument("--output-path", type=str, help='Output file path')
group.add_argument('--input-image-path', type=str, help="Input image directory")
......@@ -206,8 +208,8 @@ def generate_samples(model, config: EvaluationConfig, print_output):
if config.task == "VideoMME":
output["questions"][0][output_name] = generated
else:
output[output_name] = generated
output["prompt"] = prompt
output[output_name] = generated
if config.task == "captioning":
output["ground_truth"] = answers
......@@ -354,7 +356,7 @@ class VLMForwardStep(ForwardStep):
)
def __call__(self, tokens, position_ids, attention_mask):
num_image_tokens = (tokens == self.model.image_token_index).sum().item()
num_image_tokens = (tokens == self.model.module.image_token_index).sum().item()
num_tokens = tokens.size(1)
recv_buffer_seq_length = None
if num_image_tokens > 0:
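
Note: the change above reads image_token_index from self.model.module rather than self.model, because the model handed to the forward step is wrapped (DDP- and Float16Module-style wrappers expose the underlying LLaVA model via .module). A hedged, hypothetical helper (not part of this commit) that stays robust to the number of wrappers could look like:

def unwrap_model(model):
    # Peel off .module wrappers until the innermost model is reached.
    while hasattr(model, "module"):
        model = model.module
    return model

# num_image_tokens = (tokens == unwrap_model(self.model).image_token_index).sum().item()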
......@@ -406,7 +408,7 @@ def get_conversation(task, question):
{"role": "system", "content": "Answer the questions."},
{
"role": "user",
"content": "<image>\nProvide a one-sentence caption for provided image.",
"content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image.",
},
]
elif task in ("TextVQA", "VQAv2", "ChartQA"):
......@@ -414,13 +416,13 @@ def get_conversation(task, question):
{"role": "system", "content": "Answer the questions."},
{
"role": "user",
"content": f"<image>\n{question}\nAnswer the question using a single word or phrase.",
"content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.",
},
]
elif task in ("OCRBench", "MathVista", "AI2D"):
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"<image>\n{question}"},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "MMMU":
conversation = [
......@@ -441,7 +443,7 @@ def get_conversation(task, question):
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"<image>\n{question}"},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
return conversation
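
Note: the conversations above now use the IMAGE_TOKEN constant exported by the LLaVA model instead of a hard-coded "<image>" literal, so the placeholder cannot drift from what the model expects. A minimal usage sketch, reusing only names visible in this diff:

from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN

def captioning_conversation():
    # Mirrors the "captioning" branch of get_conversation() above.
    return [
        {"role": "system", "content": "Answer the questions."},
        {"role": "user", "content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image."},
    ]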
......@@ -464,11 +466,13 @@ def get_prompt_and_generated(prompt_and_generation, prompt_format):
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
elif prompt_format in ("nvlm-yi-34b", "qwen2p0"):
elif prompt_format in ("nvlm-yi-34b", "qwen2p0", "qwen2p5"):
splitted = prompt_and_generation.split("<|im_start|>assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
else:
raise ValueError(f"Prompt format {prompt_format} is not supported.")
# Remove possible garbage.
generated = generated.strip()
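
Note: get_prompt_and_generated() now also accepts the qwen2p5 prompt format and raises a ValueError for unknown formats instead of failing later. A hedged sketch of the ChatML-style split used by the nvlm-yi-34b / qwen2p0 / qwen2p5 branches:

def split_chatml(prompt_and_generation: str):
    # Sketch only: the generated text starts after the assistant header and
    # ends at the first <|im_end|> marker, as in the branch above.
    prompt, _, generated = prompt_and_generation.partition("<|im_start|>assistant\n")
    return prompt, generated.split("<|im_end|>")[0].strip()

p, g = split_chatml("<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello!<|im_end|>")
assert g == "Hello!"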
......@@ -489,11 +493,11 @@ def main():
args = get_args()
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
def wrapped_model_provider(pre_process, post_process, add_encoder, add_decoder):
return model_provider(pre_process, post_process, add_encoder, add_decoder, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
model = get_model(wrapped_model_provider, model_type=ModelType.encoder_and_decoder, wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
......
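
Note: main() now builds the model with ModelType.encoder_and_decoder, and the provider wrapper forwards the add_encoder/add_decoder flags so each pipeline rank builds only the vision encoder and/or language decoder it hosts. A hedged sketch of why the signature carries those flags (placeholder bodies, not this repository's model_provider):

def sketch_model_provider(pre_process, post_process, add_encoder, add_decoder,
                          parallel_output=True):
    # With pipeline parallelism the encoder and decoder can live on different
    # ranks, so each rank constructs only its own part.
    model_parts = []
    if add_encoder:
        model_parts.append("vision-encoder")    # placeholder; real code builds a ViT
    if add_decoder:
        model_parts.append("language-decoder")  # placeholder; real code builds the LLM
    return model_parts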
File mode changed from 100755 to 100644
......@@ -29,11 +29,6 @@ if [[ -z $LOAD_ITER ]]; then
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml"
......@@ -98,7 +93,7 @@ OPTIONS=" \
--eval-iters 10 \
--eval-interval 500 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
......
......@@ -4,12 +4,13 @@ export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
NUM_FRAMES=1
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
......@@ -19,11 +20,6 @@ while [[ $# -gt 0 ]]; do
shift
shift
;;
-g|--groundtruth-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
......@@ -34,12 +30,7 @@ while [[ $# -gt 0 ]]; do
shift
shift
;;
-t|--tokenizer-path)
TOKENIZER_PATH="$2"
shift
shift
;;
--task)
-t|--task)
TASK="$2"
shift
shift
......@@ -92,7 +83,7 @@ do
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--bf16 \
--micro-batch-size 1 \
......

......@@ -48,7 +48,7 @@ def get_batch(data_iterator):
pp_size = get_pipeline_model_parallel_world_size()
if not is_first_or_last_stage(pp_size, args.encoder_pipeline_model_parallel_size):
# Note these are all set to None above.
return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles
return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params
# Broadcast data.
torch.cuda.nvtx.range_push("get_data")
......@@ -66,7 +66,7 @@ def get_batch(data_iterator):
cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"]
max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"]
# Dummy image, no image.
# No image input (text-only sample) if the dataloader produced a dummy image.
if imgs.shape == torch.Size([1, 1]):
# FIXME: text-only data can cause a hang if the vision model is on its own pipeline rank and --freeze-ViT is enabled.
imgs = torch.tensor([], dtype=torch.float32, device=data_text.device)
......
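
Note: get_batch() above now returns packed_seq_params from the early-exit branch as well, so every pipeline rank unpacks the same number of values. A minimal sketch of that pattern, with stand-in values:

def get_batch_sketch(on_first_or_last_stage: bool):
    # All placeholders start as None; intermediate pipeline stages return
    # immediately, now including packed_seq_params so callers can always
    # unpack eight items.
    tokens = labels = loss_mask = attention_mask = position_ids = None
    imgs = num_tiles = packed_seq_params = None
    if not on_first_or_last_stage:
        return (tokens, labels, loss_mask, attention_mask,
                position_ids, imgs, num_tiles, packed_seq_params)
    # First/last stages would broadcast and return the real tensors here.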
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
node007 slots=8
node008 slots=8
node009 slots=8
node010 slots=8
node011 slots=8
node012 slots=8
node013 slots=8
node014 slots=8
node015 slots=8
node016 slots=8
node017 slots=8
node018 slots=8
node019 slots=8
node020 slots=8
node021 slots=8
node022 slots=8
node007 slots=8
node008 slots=8