Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
Pipeline #2498 canceled with stages
@@ -91,11 +91,11 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
     if use_te:
         transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
             args.num_experts, args.moe_grouped_gemm,
-            args.qk_layernorm, args.multi_latent_attention, args.fp8)
+            args.qk_layernorm, args.multi_latent_attention, args.moe_use_legacy_grouped_gemm)
     else:
         transformer_layer_spec = get_gpt_layer_local_spec(
             args.num_experts, args.moe_grouped_gemm,
-            args.qk_layernorm, args.multi_latent_attention)
+            args.qk_layernorm, args.multi_latent_attention, args.moe_use_legacy_grouped_gemm)
     build_model_context = nullcontext
     build_model_context_args = {}
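In the hunk above, the transformer-engine spec helper previously received args.fp8 as its fifth positional argument; both helpers now receive args.moe_use_legacy_grouped_gemm in that slot. Below is a minimal sketch of the same selection written with keyword arguments, which keeps this kind of positional-signature change from silently binding a value to the wrong parameter; the import path and keyword names are assumptions based on the diff and common Megatron-LM conventions, not something this commit shows.

# Hedged sketch, not the commit's code: the same layer-spec selection using
# keyword arguments, so a positional change (fp8 -> moe_use_legacy_grouped_gemm)
# cannot bind a flag to the wrong parameter. Import path and keyword names are
# assumed, not taken from this diff.
from megatron.core.models.gpt.gpt_layer_specs import (
    get_gpt_layer_local_spec,
    get_gpt_layer_with_transformer_engine_spec,
)

def build_layer_spec(args, use_te: bool):
    kwargs = dict(
        num_experts=args.num_experts,
        moe_grouped_gemm=args.moe_grouped_gemm,
        qk_layernorm=args.qk_layernorm,
        multi_latent_attention=args.multi_latent_attention,
        moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
    )
    if use_te:
        return get_gpt_layer_with_transformer_engine_spec(**kwargs)
    return get_gpt_layer_local_spec(**kwargs)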
@@ -129,6 +129,8 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
        rope_scaling=args.use_rope_scaling
    )
    print_rank_0(model)
    return model
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -83,12 +83,6 @@ def model_provider(
    assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
    if args.pipeline_model_parallel_size > 1:
        assert not args.freeze_LM, "Freezing a pipeline parallel language model is not currently supported"
        if args.encoder_pipeline_model_parallel_size == 1:
            assert not args.freeze_ViT, "Freezing a vision encoder on its own pipeline rank is not currently supported"
    num_image_embeddings = get_num_image_embeddings(
        args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token,
        class_token_len=1, pixel_shuffle=False, use_tile_tags=False
@@ -129,7 +123,7 @@ def model_provider(
        language_transformer_layer_spec = decoder_model_with_local_default_spec(
            args.num_experts, args.moe_grouped_gemm
        )
    # Prepare mask type for any required padding to support CP/SP sequence sharding.
    if mp_padding_needed > 0:
        if language_transformer_layer_spec.submodules.self_attention.params.get('attn_mask_type', '') == AttnMaskType.causal:
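The hunk is cut off inside the if mp_padding_needed > 0: branch. A hedged sketch of what such a branch typically does follows, assuming the intent is to swap the plain causal mask for a padding-aware causal mask once pad tokens are appended for CP/SP sharding; this illustrates the idea and is not necessarily the commit's actual branch body.

# Hedged sketch: when the text sequence is padded so it splits evenly across
# context-/sequence-parallel ranks, a plain causal mask would attend to the pad
# positions, so the layer spec's mask type is switched to a padding-aware causal
# variant. The helper name is illustrative; AttnMaskType comes from megatron.core.
from megatron.core.transformer.enums import AttnMaskType

def mask_type_for_padding(language_transformer_layer_spec, mp_padding_needed: int):
    params = language_transformer_layer_spec.submodules.self_attention.params
    if mp_padding_needed > 0 and params.get('attn_mask_type') == AttnMaskType.causal:
        params['attn_mask_type'] = AttnMaskType.padding_causal
    return language_transformer_layer_spec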
@@ -351,10 +345,10 @@ def get_batch(data_iterator):
    labels = data_i["labels"].long()
    loss_mask = data_f["loss_mask"].float()
    images = data_f["image"].float()
    if cp_size > 1 or args.sequence_parallel:
        vision_model_type = "clip"
        # Calculate the number of image embedding tokens that will be added to the text tokens
        num_image_embeddings_per_tile = get_num_image_embeddings(
            args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token, 1
        )
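For reference, the per-tile count returned by get_num_image_embeddings is driven by the patch grid. Below is a rough sketch of that arithmetic for a CLIP-style encoder, with illustrative numbers rather than values from this commit; the real helper also handles pixel shuffle and tile tags, which are ignored here.

# Hedged sketch of the patch-grid arithmetic behind get_num_image_embeddings for a
# CLIP-style vision tower. Ignores pixel shuffle and tile tags; numbers are
# illustrative, not taken from this commit.
def approx_num_image_embeddings(img_h, img_w, patch_dim, disable_class_token, class_token_len=1):
    num_patches = (img_h // patch_dim) * (img_w // patch_dim)
    return num_patches + (0 if disable_class_token else class_token_len)

# Example: a 336x336 image with 14x14 patches gives 24 * 24 = 576 patch embeddings,
# or 577 if the class token is kept.
assert approx_num_image_embeddings(336, 336, 14, disable_class_token=True) == 576
assert approx_num_image_embeddings(336, 336, 14, disable_class_token=False) == 577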
@@ -367,7 +361,7 @@ def get_batch(data_iterator):
        num_images_per_sample = torch.sum(image_token_mask, dim=-1)
        img_seq_len = (num_image_embeddings_per_tile * num_images_per_sample - num_images_per_sample).max()
        packed_seq_params = _get_packed_seq_params(tokens, img_seq_len, mp_padding_needed_for_text)
    # slice batch along sequence dimension for context parallelism
    batch = get_batch_on_this_cp_rank({"tokens": tokens, "position_ids": position_ids})
    attention_mask = None  # Use the attention mask type defined in layer spec. Typically no mask for the vision model and causal mask for the language model.
......
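The img_seq_len expression in the hunk above subtracts num_images_per_sample because each image token already occupies one text position; replacing it with num_image_embeddings_per_tile embeddings only adds (per_tile - 1) positions per image. A small worked example with toy values, not taken from this commit:

# Hedged toy example of the img_seq_len arithmetic: each image token is replaced
# by num_image_embeddings_per_tile embeddings, so it contributes (per_tile - 1)
# extra positions on top of the token it replaces.
import torch

num_image_embeddings_per_tile = 576
image_token_mask = torch.tensor([[0, 1, 0, 0, 1, 0],   # sample 0: two image tokens
                                 [0, 0, 1, 0, 0, 0]])  # sample 1: one image token
num_images_per_sample = torch.sum(image_token_mask, dim=-1)  # tensor([2, 1])
img_seq_len = (num_image_embeddings_per_tile * num_images_per_sample
               - num_images_per_sample).max()
print(img_seq_len)  # tensor(1150), i.e. 2 * (576 - 1)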
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -6,6 +6,7 @@ regex
 pyyaml
 sentencepiece
 pybind11
+blobfile
 # ==== test ====
 nltk
......
export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
export TORCHINDUCTOR_BENCHMARK_FUSION=1
export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_BENCHMARK_KERNEL=1
export TORCHINDUCTOR_MAX_AUTOTUNE=1
#export FLASH_ATTENTION_PRINT_PARAM=1
export TORCHINDUCTOR_CACHE_DIR=./cache
# export USE_AOTRITON_FA=1
# export USE_BSHD=1  # use the FlashAttention BSHD layout
# for unique kernel names
# export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1
mpirun --allow-run-as-root -np 8 ./Llama_pretraining.sh localhost
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 512 --hostfile hostfile_gptmoe \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
./train_GPT-MOE_567B.sh node002 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 8 --allow-run-as-root \
./train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 8 --allow-run-as-root \
./train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
mpirun -np 32 --hostfile hostfile_mixtral8x7B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
./train_mixtral_8x7B_4nodes.sh node066 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
File mode changed from 100755 to 100644