Commit bc5c7fa7 authored by wxj's avatar wxj
Browse files

第一次测试提交

parent 70fddd0f
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_mcore_models: True
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
delay_grad_reduce: True
overlap_param_gather: False
delay_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_batch_size_to_tensorboard: False
log_learning_rate_to_tensorboard: True
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: False
one_logger_project: e2e-tracking
one_logger_entity: hwinf_dcm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
#!/bin/bash

# Runs the "175B" parameter model
#
# Usage:
#   <this script> CHECKPOINT_PATH TENSORBOARD_LOGS_PATH VOCAB_FILE MERGE_FILE DATA_PATH

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$((GPUS_PER_NODE * NUM_NODES))

CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document

# NODE_RANK is forwarded so that each node of a multinode launch can
# identify itself to torchrun; with NUM_NODES=1 it stays at rank 0.
DISTRIBUTED_ARGS=(
    --nproc_per_node "$GPUS_PER_NODE"
    --nnodes "$NUM_NODES"
    --node_rank "$NODE_RANK"
    --master_addr "$MASTER_ADDR"
    --master_port "$MASTER_PORT"
)

GPT_MODEL_ARGS=(
    --num-layers 96
    --hidden-size 12288
    --num-attention-heads 96
    --seq-length 2048
    --max-position-embeddings 2048
)

TRAINING_ARGS=(
    --micro-batch-size 1
    --global-batch-size 1536
    --rampup-batch-size 16 16 5859375
    --train-iters 500000
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --fp16
    --lr 6.0e-5
    --lr-decay-style cosine
    --min-lr 6.0e-6
    --lr-warmup-fraction .001
    --lr-decay-iters 430000
    --use-mcore-models
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 8
    --pipeline-model-parallel-size 16
)

DATA_ARGS=(
    --data-path "$DATA_PATH"
    --vocab-file "$VOCAB_FILE"
    --merge-file "$MERGE_FILE"
    --split 949,50,1
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --save "$CHECKPOINT_PATH"
    --load "$CHECKPOINT_PATH"
    --eval-iters 10
    --tensorboard-dir "$TENSORBOARD_LOGS_PATH"
)

# Quote the array expansions so that every element is passed to torchrun as
# exactly one argument, even when a path contains whitespace.
torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_gpt.py \
    "${GPT_MODEL_ARGS[@]}" \
    "${TRAINING_ARGS[@]}" \
    "${MODEL_PARALLEL_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${EVAL_AND_LOGGING_ARGS[@]}"
# Megatron Model Optimization and Deployment
## Installation
We recommend that users follow TensorRT-LLM's official installation guide to build it from source
and proceed with a containerized environment (`docker.io/tensorrt_llm/release:latest`):
```
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
git checkout v0.7.1
make -C docker release_build
```
> **TROUBLE SHOOTING:** rather than copying each folder separately in `docker/Dockerfile.multi`,
> you may need to copy the entire dir as `COPY ./ /src/tensorrt_llm` since a `git submodule` is
> called later which requires `.git` to continue.
Once the container is built, install `nvidia-ammo` and additional dependencies for sharded checkpoint support:
```
pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo
pip install zarr tensorstore==0.1.45
```
TensorRT-LLM quantization functionalities are currently packaged in `nvidia-ammo`.
You can find more documentation about `nvidia-ammo` in [TensorRT-LLM's quantization
examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/quantization).
## Support Matrix
The following matrix shows the current support for the PTQ + TensorRT-LLM export flow.
| model | fp16 | int8_sq | fp8 | int4_awq |
|-----------------------------|------|---------| ----| -------- |
| nextllm-2b | x | x | x | |
| nemotron3-8b | x | | x | |
| nemotron3-15b | x | | x | |
| llama2-text-7b | x | x | x | TP2 |
| llama2-chat-70b | x | x | x | TP4 |
Our PTQ + TensorRT-LLM flow has native support on MCore `GPTModel` with a mixed layer spec (native ParallelLinear
and Transformer-Engine Norm (`TENorm`)). Note that this is not the default mcore gpt spec. You can still load the
following checkpoint formats with some remedy:
| GPTModel | sharded | remedy arguments |
|-----------------------------------|---------|-----------------------------------------|
| megatron.legacy.model | | `--ammo-load-classic-megatron-to-mcore` |
| TE-Fused (default mcore gpt spec) | | `--ammo-convert-te-to-local-spec` |
| TE-Fused (default mcore gpt spec) | x | |
> **TROUBLE SHOOTING:** If you are trying to load an unpacked `.nemo` sharded checkpoint, then typically you will
> need to add `additional_sharded_prefix="model."` to `ammo_load_checkpoint()` since NeMo has an additional
> `model.` wrapper on top of the `GPTModel`.
> **NOTE:** flag `--ammo-load-classic-megatron-to-mcore` may not work on all legacy checkpoint versions.
## Examples
> **NOTE:** we only provide a simple text generation script to test the generated TensorRT-LLM engines. For
> a production-level API server or enterprise support, see [NeMo](https://github.com/NVIDIA/NeMo) and TensorRT-LLM's
> backend for [NVIDIA Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server).
### nemotron3-8B FP8 Quantization and TensorRT-LLM Deployment
First download the nemotron checkpoint from https://huggingface.co/nvidia/nemotron-3-8b-base-4k, extract the
sharded checkpoint from the `.nemo` tarball and fix the tokenizer file name.
> **NOTE:** The following cloning method uses `ssh` and assumes you have registered your `ssh-key` with Hugging Face.
> If you want to clone with `https`, run `git clone https://huggingface.co/nvidia/nemotron-3-8b-base-4k` with an access token.
```sh
git lfs install
git clone git@hf.co:nvidia/nemotron-3-8b-base-4k
cd nemotron-3-8b-base-4k
tar -xvf Nemotron-3-8B-Base-4k.nemo
mv 586f3f51a9cf43bc9369bd53fa08868c_a934dc7c3e1e46a6838bb63379916563_3feba89c944047c19d5a1d0c07a85c32_mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model
cd ..
```
Now launch the PTQ + TensorRT-LLM export script,
```
bash examples/inference/ptq_trtllm_nemotron3_8b.sh ./nemotron-3-8b-base-4k None
```
By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the
quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can
be restored for further evaluation. The TensorRT-LLM engine is exported to `/tmp/ammo` by default.
The script expects `${CHECKPOINT_DIR}` (`./nemotron-3-8b-base-4k`) to have the following structure:
```
├── model_weights
│ ├── common.pt
│ ...
├── model_config.yaml
├── mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model
```
> **NOTE:** The script is using `TP=8`. Change `$TP` in the script if your checkpoint has a different tensor
> model parallelism.
> **KNOWN ISSUES:** The `mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model` in the checkpoint is for
> Megatron-LM's `GPTSentencePiece` tokenizer.
> For TensorRT-LLM, we are trying to load this tokenizer as a Hugging Face `T5Tokenizer` by changing
> some special tokens, `encode`, and `batch_decode`. As a result, the tokenizer behavior in TensorRT-LLM engine may
> not match exactly.
> **TROUBLE SHOOTING:** If you are loading `.nemo` sharded checkpoint here, call
> `ammo_load_checkpoint(..., additional_sharded_prefix="model.")` with additional sharded prefix in
> `text_generation_ptq.py` to align the sharded keys.
### llama2-text-7b INT8 SmoothQuant and TensorRT-LLM Deployment
> **NOTE:** Due to the LICENSE issue, we do not provide an MCore checkpoint to download. Users can follow
> the instructions in `docs/llama2.md` to convert the checkpoint to megatron classic `GPTModel` format and
> use `--ammo-load-classic-megatron-to-mcore` flag which will remap the checkpoint to the MCore `GPTModel` spec
> that we support.
```sh
bash examples/inference/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR}
```
The script expects `${CHECKPOINT_DIR}` to have the following structure:
```
├── hf
│ ├── tokenizer.config
│ ├── tokenizer.model
│ ...
├── iter_0000001
│ ├── mp_rank_00
│ ...
├── latest_checkpointed_iteration.txt
```
In short, other than the converted llama megatron checkpoint, also put the Hugging Face checkpoint inside as
the source of the tokenizer.
#!/bin/bash

# PTQ + TensorRT-LLM export for a LLaMA2 text 7b checkpoint.
#
# Usage:
#   <this script> [CHECKPOINT_DIR] [QUANT_CFG]
# Both arguments are optional and fall back to the defaults below.

DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0"
NAME="${1:-$DEFAULT_NAME}"

DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"

# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
PP=1
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/hf/tokenizer.model"

# LLaMA2 text 7b has ffn_hidden_size 11008. int4_awq requires a block_size of 128 as a result the TP can at most be 2
if [ "$QUANT_CFG" = "int4_awq" ]; then
    INFERENCE_TP="2"
fi

# Options consumed by text_generation_ptq.py for quantization and engine export.
additional_options=" \
    --ammo-quant-cfg ${QUANT_CFG} \
    --ammo-load-classic-megatron-to-mcore \
    --decoder ${DECODER_TYPE} \
    --engine-dir /tmp/ammo \
    --max-input-len 2048 \
    --max-output-len 512 \
    --max-batch-size 8 \
    --inference-tensor-parallel ${INFERENCE_TP} "

# Options consumed by trtllm_text_generation.py; the tokenizer is the HF folder.
trtllm_options=" \
    --engine-dir /tmp/ammo \
    --tokenizer ${CHECKPOINT_LOAD_DIR}/hf \
    --max-output-len 512 "

# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1

# The pipeline size comes from ${PP} so that editing PP above takes effect.
options=" \
    --disable-bias-linear \
    --swiglu \
    --untie-embeddings-and-output-weights \
    --use-rotary-position-embeddings \
    --normalization RMSNorm \
    --norm-epsilon 1e-5 \
    --no-position-embedding \
    --no-masked-softmax-fusion \
    --no-bias-gelu-fusion \
    --no-bias-dropout-fusion \
    --no-async-tensor-model-parallel-allreduce \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --num-layers 32 \
    --hidden-size 4096 \
    --ffn-hidden-size 11008 \
    --num-attention-heads 32 \
    --seq-length 2048 \
    --max-position-embeddings 4096 \
    --micro-batch-size 1 \
    --make-vocab-size-divisible-by 1 \
    --tokenizer-type Llama2Tokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --save-interval 1000000 \
    --bf16 \
    --use-mcore-models "

set +x

# Precompile CUDA extensions
python -c "import ammo.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"

# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"

# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options} --load ${CHECKPOINT_LOAD_DIR}

# This script is using mpi4py which will fork multiple processes.
python examples/inference/trtllm_text_generation.py ${trtllm_options}
#!/bin/bash

# PTQ + TensorRT-LLM export for a nemotron3-8b checkpoint.
#
# Usage:
#   <this script> [CHECKPOINT_DIR] [QUANT_CFG]
# Both arguments are optional and fall back to the defaults below.

DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.2.0"
NAME="${1:-$DEFAULT_NAME}"

DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"

# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="gptnext"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model"

# int4_awq is exported with a single inference tensor-parallel rank.
if [ "$QUANT_CFG" = "int4_awq" ]; then
    INFERENCE_TP="1"
fi

# Options consumed by text_generation_ptq.py for quantization and engine export.
additional_options=" \
    --ammo-quant-cfg ${QUANT_CFG} \
    --ammo-load-classic-megatron-to-mcore \
    --decoder ${DECODER_TYPE} \
    --engine-dir /tmp/ammo \
    --max-input-len 2048 \
    --max-output-len 512 \
    --max-batch-size 8 \
    --inference-tensor-parallel ${INFERENCE_TP} "

# Options consumed by trtllm_text_generation.py; the tokenizer is the .model file.
trtllm_options=" \
    --engine-dir /tmp/ammo \
    --tokenizer ${TOKENIZER_MODEL} \
    --max-output-len 512 "

# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1

# --load is given once, on the torchrun command line below, rather than
# being repeated here as well.
options=" \
    --apply-layernorm-1p \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --no-position-embedding \
    --use-rotary-position-embeddings \
    --rotary-percent 0.5 \
    --squared-relu \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size 1 \
    --num-layers 32 \
    --hidden-size 4096 \
    --num-attention-heads 32 \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --micro-batch-size 1 \
    --tokenizer-type GPTSentencePieceTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --save-interval 1000000 \
    --bf16 \
    --use-mcore-models "

set +x

# Precompile CUDA extensions
python -c "import ammo.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"

# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"

# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options} --load ${CHECKPOINT_LOAD_DIR}

# This script is using mpi4py which will fork multiple processes.
python examples/inference/trtllm_text_generation.py ${trtllm_options}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Sample Generate GPT."""
import functools
import os
import sys
from pathlib import Path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
import ammo.torch.quantization as atq
import torch
from datasets import load_dataset
# [ModelOpt]: changing the default model provider to the AMMO version
from megatron.training import get_args, print_rank_0
from megatron.training.checkpointing import load_checkpoint, save_checkpoint
from megatron.core import mpu
from megatron.core.dist_checkpointing import load
from megatron.inference.arguments import add_ammo_args
from megatron.inference.gpt.model_provider import model_provider
from megatron.training.initialize import initialize_megatron
from megatron.inference.text_generation import generate_and_post_process
from megatron.training import get_model
from megatron.training.utils import unwrap_model
QUANT_CFG_CHOICES = {
"int8": atq.INT8_DEFAULT_CFG,
"int8_sq": atq.INT8_SMOOTHQUANT_CFG,
"fp8": atq.FP8_DEFAULT_CFG,
"int4_awq": atq.INT4_AWQ_CFG,
"w4a8_awq": atq.W4A8_AWQ_BETA_CFG,
}
def add_trtllm_args(parser):
    """Register the TensorRT-LLM export options on ``parser``.

    All options are placed in an argument group titled "trtllm". Nothing is
    returned; the parser is modified in place.
    """
    trtllm = parser.add_argument_group(title="trtllm")

    trtllm.add_argument("--engine-dir", type=str, help="The output TensorRT-LLM engine dir.")
    trtllm.add_argument(
        "--decoder",
        type=str,
        choices=["gptnext", "llama"],
        help="The decoder type of the model.",
    )

    # The remaining options are all integers with a default, so they are
    # registered from a table (kept in the original declaration order).
    integer_options = (
        ("--max-input-len", 2048, "Max input sequence length."),
        ("--max-output-len", 512, "Max output sequence length."),
        ("--max-batch-size", 32, "Max batch size."),
        (
            "--inference-tensor-parallel",
            1,
            "Tensor parallel for the inference time, can be different from the training config.",
        ),
    )
    for flag, default_value, help_text in integer_options:
        trtllm.add_argument(flag, type=int, help=help_text, default=default_value)
def add_text_generate_ptq_args(parser):
    """Extend ``parser`` with the options of the AMMO text-generation PTQ example.

    Adds the calibration options, the ``--prompts`` option, and then the
    arguments contributed by ``add_ammo_args`` and ``add_trtllm_args``.

    Returns:
        The same ``parser`` object that was passed in.
    """
    default_prompts = (
        "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
    )

    ptq_group = parser.add_argument_group(title='AMMO text generation ptq')
    ptq_group.add_argument(
        "--calib-dataset",
        type=str,
        default="cnn_dailymail",
        help="Calibration datasets from HuggingFace datasets.",
    )
    ptq_group.add_argument(
        "--calib-steps",
        type=int,
        default=512,
        help="Steps to perform atq.quantize calibration.",
    )

    # --prompts is registered on the parser itself rather than on the group.
    parser.add_argument(
        "--prompts",
        type=str,
        default=default_prompts,
        help="Input texts. Please use | to separate different batches.",
    )

    # Delegate the remaining option families to their own registration helpers.
    add_ammo_args(parser)
    add_trtllm_args(parser)
    return parser
def get_calib_dataloader(
    data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512
):
    """Yield batches of truncated text samples to drive calibration.

    Args:
        data: Name of the calibration dataset. Supported values are
            "wikitext" and "cnn_dailymail".
        batch_size: Number of samples taken from the dataset per batch.
        calib_size: Requested number of calibration samples. It is capped at
            the dataset length and raised to at least ``batch_size``.
        max_sequence_length: Each sample is sliced to its first
            ``max_sequence_length`` items (characters, assuming the text
            column holds strings).

    Yields:
        A list with the truncated samples of one batch.

    Raises:
        ValueError: If ``data`` is not a supported dataset name. Because this
            is a generator, the error surfaces on the first iteration.
    """
    if data == "wikitext":
        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
        text_column = "text"
    elif data == "cnn_dailymail":
        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
        text_column = "article"
    else:
        # Without this branch an unknown name would only fail later with an
        # UnboundLocalError on `dataset`, hiding the real cause.
        raise ValueError(
            f"Unsupported calibration dataset {data!r}; "
            "expected 'wikitext' or 'cnn_dailymail'."
        )

    calib_size = max(min(len(dataset), calib_size), batch_size)
    for i in range(calib_size // batch_size):
        batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
        yield [sample[:max_sequence_length] for sample in batch]
def ammo_load_checkpoint(
    model, optimizer=None, opt_param_scheduler=None, strict=True, additional_sharded_prefix=""
):
    """Load a Megatron checkpoint, choosing the loader by checkpoint layout.

    When ``<args.load>/model_weights`` exists and neither an optimizer nor a
    scheduler is supplied, the sharded state dict is read through
    ``megatron.core.dist_checkpointing.load``. In every other case the regular
    ``load_checkpoint`` is used.

    Args:
        model: MCoreGPTModel instance
        optimizer: Megatron optimizer instance
        opt_param_scheduler: Megatron scheduler instance
        strict: if True, no extra or missing keys are allowed while loading the state_dict
        additional_sharded_prefix (str): Extra prefix used to align the sharded checkpoint keys.
            For an .nemo sharded checkpoint this is usually `model.`; otherwise typically empty.
    """

    def _remove_prefix_state_dict_pre_hook(
        state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs,
    ):
        """Pytorch _load_state_dict_pre_hook that strips the additional sharded prefix from keys."""
        if additional_sharded_prefix is None:
            return
        # Snapshot the matching keys first; the dict is mutated while renaming.
        prefixed_keys = [key for key in state_dict if key.startswith(additional_sharded_prefix)]
        strip_len = len(additional_sharded_prefix)
        for prefixed_key in prefixed_keys:
            state_dict[prefixed_key[strip_len:]] = state_dict.pop(prefixed_key)

    megatron_args = get_args()
    sharded_load_dir = Path(megatron_args.load + "/" + "model_weights")
    model_only = optimizer is None and opt_param_scheduler is None

    if not (sharded_load_dir.exists() and model_only):
        # Fall back to the regular Megatron checkpoint loader.
        _ = load_checkpoint(model, optimizer, opt_param_scheduler, strict=strict)
        return

    gpt_model = unwrap_model(model)[0]
    sharded_state_dict = gpt_model.sharded_state_dict(prefix=additional_sharded_prefix)
    if additional_sharded_prefix:
        # The loaded keys carry the extra prefix, so strip it before they
        # are matched against the module's own parameter names.
        gpt_model._register_load_state_dict_pre_hook(_remove_prefix_state_dict_pre_hook)
    gpt_model.load_state_dict(load(sharded_state_dict, sharded_load_dir))
if __name__ == "__main__":
    # Parse the command line (including the PTQ / TensorRT-LLM extras) and
    # initialize Megatron with defaults suited to inference-only loading.
    initialize_megatron(
        extra_args_provider=add_text_generate_ptq_args,
        args_defaults={
            'tokenizer_type': 'GPT2BPETokenizer',
            'no_load_rng': True,
            'no_load_optim': True,
        },
    )

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print("Interleaved pipeline schedule is not yet supported for text generation.")
        # sys.exit() is used instead of the site-provided exit(), which is
        # meant for interactive sessions and is not guaranteed to exist.
        sys.exit()

    text_generation_model_provider = functools.partial(model_provider, parallel_output=False)
    model = get_model(text_generation_model_provider, wrap_with_ddp=False)
    assert len(model) == 1, "Above condition should have caught this"

    if args.load is not None:
        _ = ammo_load_checkpoint(
            model,
            None,
            None,
            strict=not args.untie_embeddings_and_output_weights,
            additional_sharded_prefix="model.",
        )
    else:
        print_rank_0("WARNING: No checkpoint is loaded for PTQ! The process will still continue.")

    all_prompts = args.prompts.split("|")

    def custom_prompt_forward_loop_func():
        """Generate 128 tokens for each user prompt and print the result."""
        for prompt in all_prompts:
            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
                prompts_plus_generations, _, _, _ = generate_and_post_process(
                    model[0],
                    prompts=[prompt],
                    tokens_to_generate=128,
                    return_output_log_probs=True,
                    temperature=1.0,
                )
                print_rank_0(prompts_plus_generations)
            else:
                # The other ranks join the same call without prompts.
                generate_and_post_process(model[0])

    def hf_dataset_forward_loop_func():
        """Run the calibration dataset batches through the model (no new tokens)."""
        dataloader = get_calib_dataloader(args.calib_dataset, calib_size=args.calib_steps)
        for prompts in dataloader:
            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
                # Outputs are not needed; the forward pass itself is the goal.
                generate_and_post_process(
                    model[0],
                    prompts=prompts,
                    tokens_to_generate=0,
                    return_output_log_probs=True,
                    temperature=1.0,
                )
            else:
                generate_and_post_process(model[0])

    # Calibrate on the dataset when one is configured, else on the prompts.
    ptq_forward_loop_func = custom_prompt_forward_loop_func
    if args.calib_dataset is not None:
        ptq_forward_loop_func = hf_dataset_forward_loop_func

    if args.ammo_quant_cfg in QUANT_CFG_CHOICES:
        atq_config = QUANT_CFG_CHOICES[args.ammo_quant_cfg]
        if "awq" in args.ammo_quant_cfg:
            weight_quantizer = atq_config["quant_cfg"]["*weight_quantizer"]  # type: ignore
            if isinstance(weight_quantizer, list):
                weight_quantizer = weight_quantizer[0]
            # Force the last block size of the AWQ weight quantizer to 128.
            weight_quantizer["block_sizes"][-1] = 128
        atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False}
        print_rank_0("atq.quantize: output_layer quantization is disable")
        atq.quantize(model[0], atq_config, ptq_forward_loop_func)
        # Generate from the prompts again, now with the quantized model.
        custom_prompt_forward_loop_func()
        if args.save:
            save_checkpoint(1, model, None, None)
    else:
        # Unknown or absent quantization config: only run plain generation.
        custom_prompt_forward_loop_func()

    if args.engine_dir:
        from ammo.deploy.llm import model_config_to_tensorrt_llm
        from ammo.torch.export import torch_to_model_config

        assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported."

        Path(args.engine_dir).mkdir(parents=True, exist_ok=True)

        print_rank_0("Exporting model_configs for TRT LLM.")
        model = unwrap_model(model)
        model = model[0]

        # In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default.
        model_configs = torch_to_model_config(
            model,
            args.decoder,
            torch.float16,
            inference_tensor_parallel=args.inference_tensor_parallel,
        )

        print_rank_0("Building TRT LLM engines.")
        for model_config in model_configs:
            model_config_to_tensorrt_llm(
                model_config,
                args.engine_dir,
                max_input_len=args.max_input_len,
                max_output_len=args.max_output_len,
                max_batch_size=args.max_batch_size,
                max_beam_width=1,
                num_build_workers=1,
                inflight_batching=False,
                enable_sparsity=False,
            )
        print_rank_0(f"TRT LLM engines saved to {args.engine_dir}")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""An example script to run the tensorrt_llm engine."""
import argparse
from pathlib import Path
import numpy as np
import torch
from ammo.deploy.llm import generate, load, unload
from transformers import AutoTokenizer, T5Tokenizer
class CustomSentencePieceTokenizer(T5Tokenizer):
    """GPTSentencePiece tokenizer built as a thin variation of ``T5Tokenizer``.

    Note:
        Only `encode` and `batch_decode` are adapted, because those are the
        entry points used with the TensorRT-LLM engine. The remaining
        functions are inherited unchanged and have not been tested.
    """

    def __init__(self, model):
        """Build the tokenizer from a SentencePiece ``model`` file with no extra ids."""
        super().__init__(model, extra_ids=0, bos_token="<s>", pad_token="<pad>")

    def encode(self, text, add_special_tokens: bool = True, **kwargs):
        """Encode ``text`` to ids with the SentencePiece model.

        ``add_special_tokens`` and ``kwargs`` are accepted but not used.
        """
        token_ids = self.sp_model.encode_as_ids(text)
        return token_ids

    def batch_decode(self, sequences, skip_special_tokens: bool = False, **kwargs):
        """Decode ``sequences`` with the SentencePiece model.

        Tensors and numpy arrays are converted to nested lists first.
        ``skip_special_tokens`` and ``kwargs`` are accepted but not used.
        """
        is_array_like = torch.is_tensor(sequences) or isinstance(sequences, np.ndarray)
        id_lists = sequences.tolist() if is_array_like else sequences
        return self.sp_model.decode(id_lists)
def parse_arguments():
    """Parse the command-line options of the TensorRT-LLM example script.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    default_input_texts = (
        "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
    )

    cli = argparse.ArgumentParser()
    cli.add_argument("--tokenizer", type=str, default="")
    cli.add_argument("--max-output-len", type=int, default=100)
    cli.add_argument("--engine-dir", type=str, default="/tmp/ammo")
    cli.add_argument(
        "--input-texts",
        type=str,
        default=default_input_texts,
        help="Input texts. Please use | to separate different batches.",
    )
    cli.add_argument("--max-num-beams", type=int, default=1)
    cli.add_argument("--profiler-output", type=str, default="")
    return cli.parse_args()
def run(args):
    """Load a TensorRT-LLM engine, generate text for the prompts and report GPU memory use.

    Args:
        args: Parsed options. Reads ``tokenizer``, ``input_texts``,
            ``engine_dir``, ``max_num_beams`` and ``max_output_len``;
            ``profiler_output`` is defaulted to ``""`` when missing.

    Raises:
        ValueError: If ``args.tokenizer`` is empty, or is neither an existing
            directory nor an existing file.
    """
    tokenizer_error = (
        "arg.tokenizer must be a dir to a hf tokenizer checkpoint for llama or a SentencePiece .model file for gptnext"
    )

    # Path("") normalizes to ".", which is always a directory. Without this
    # guard an unset --tokenizer (default "") would take the HF branch below
    # and pass an empty name to AutoTokenizer instead of raising ValueError.
    if not args.tokenizer:
        raise ValueError(tokenizer_error)

    tokenizer_path = Path(args.tokenizer)
    if tokenizer_path.is_dir():
        # For llama models, use local HF tokenizer which is a folder.
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
    elif tokenizer_path.is_file():
        # For nextllm and nemotron models, use local Megatron GPTSentencePiece tokenizer which is a model file.
        tokenizer = CustomSentencePieceTokenizer(args.tokenizer)
    else:
        raise ValueError(tokenizer_error)

    # Callers that build `args` by hand may omit the profiler option.
    if not hasattr(args, "profiler_output"):
        args.profiler_output = ""

    input_texts = args.input_texts.split("|")
    assert input_texts, "input_text not specified"
    print(input_texts)

    # mem_get_info() is sampled before loading and after generation; index 0
    # of each result is used as the free-memory figure below.
    free_memory_before = torch.cuda.mem_get_info()

    host_context = load(
        tokenizer=tokenizer, engine_dir=args.engine_dir, num_beams=args.max_num_beams
    )
    try:
        torch.cuda.cudart().cudaProfilerStart()
        outputs = generate(input_texts, args.max_output_len, host_context, None, args.profiler_output)
        print(outputs)
        torch.cuda.cudart().cudaProfilerStop()

        free_memory_after = torch.cuda.mem_get_info()
        print(
            f"Use GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB"
        )
    finally:
        # Always release the loaded engine, even when generation fails.
        unload(host_context)
# Script entry point: parse the CLI options and hand them straight to run().
if __name__ == "__main__":
    run(parse_arguments())
#!/bin/bash
# Runs tools/merge_mp_partitions.py for a BERT checkpoint using the model
# shape and tokenizer settings listed below.

CHECKPOINT_PATH="checkpoints/bert_345m"
VOCAB_FILE="bert-vocab.txt"
TENSOR_MODEL_PARALLEL_SIZE=2

# WORLD_SIZE is set only in the environment of this single python invocation.
WORLD_SIZE="${TENSOR_MODEL_PARALLEL_SIZE}" python tools/merge_mp_partitions.py \
    --model-type BERT \
    --tensor-model-parallel-size "${TENSOR_MODEL_PARALLEL_SIZE}" \
    --tokenizer-type BertWordPieceLowerCase \
    --vocab-file "${VOCAB_FILE}" \
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --seq-length 512 \
    --max-position-embeddings 512 \
    --load "${CHECKPOINT_PATH}"
# Multi-Stage Prompting for Knowledgeable Dialogue Generation
This directory contains all the scripts for multi-stage prompting for knowledgeable dialogue generation, covering data preparation, knowledge generation, and response generation. More details are available in the [`knowledgeable task directory`](../../tasks/msdp).
#!/bin/bash

# Data preparation for our framework: preprocessing the WoW and WoI datasets
# The datasets can be downloaded through the following links:
# WoW: https://parl.ai/projects/wizard_of_wikipedia/
# WoI: https://parl.ai/projects/sea/

# All expansions below are quoted so that a working directory or data folder
# containing whitespace is passed to python as a single argument.
DIR="$(pwd)"

# Before running the preprocessing, please download
# the Wizard of Wikipedia and Wizard of Internet datasets.
# Replace each <...> placeholder (quote the value if it contains spaces).
WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>

# We provide examples for processing the raw data from Wizard of Wikipedia
# Processing the train dataset (train.json)
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func process_wow_dataset \
    --raw_file "${WOW_DATA_FOLDER}/train.json" \
    --processed_file "${WOW_DATA_FOLDER}/train_processed.txt"

# Processing the test seen dataset (test_random_split.json)
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func process_wow_dataset \
    --raw_file "${WOW_DATA_FOLDER}/test_random_split.json" \
    --processed_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
    --knwl_ref_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt" \
    --resp_ref_file "${WOW_DATA_FOLDER}/output_testseen_response_reference.txt"

# Processing the test unseen dataset (test_topic_split.json)
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func process_wow_dataset \
    --raw_file "${WOW_DATA_FOLDER}/test_topic_split.json" \
    --processed_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
    --knwl_ref_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt" \
    --resp_ref_file "${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt"

# We provide the following script to process the raw data from Wizard of Internet
# Processing the test dataset (test.jsonl)
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func process_woi_dataset \
    --raw_file "${WOI_DATA_FOLDER}/test.jsonl" \
    --processed_file "${WOI_DATA_FOLDER}/test_processed.txt" \
    --knwl_ref_file "${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt" \
    --resp_ref_file "${WOI_DATA_FOLDER}/output_test_response_reference.txt"

# Get the knowledge generation prompts for each test dataset in WoW and WoI
MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL>

# WoW test seen
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func get_knwl_gen_prompts \
    --test_file "${WOW_DATA_FOLDER}/testseen_processed.txt" \
    --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
    --model_file "${MODEL_FILE}" \
    --processed_file "${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json" \
    --data_type wow_seen

# WoW test unseen
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func get_knwl_gen_prompts \
    --test_file "${WOW_DATA_FOLDER}/testunseen_processed.txt" \
    --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
    --model_file "${MODEL_FILE}" \
    --processed_file "${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json" \
    --data_type wow_unseen

# WoI (the prompts are still drawn from the WoW training file)
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func get_knwl_gen_prompts \
    --test_file "${WOI_DATA_FOLDER}/test_processed.txt" \
    --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
    --model_file "${MODEL_FILE}" \
    --processed_file "${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json" \
    --data_type woi

# Get the response generation prompts (can be applied for all the test datasets)
python "${DIR}/tasks/msdp/preprocessing.py" \
    --func get_resp_gen_prompts \
    --train_file "${WOW_DATA_FOLDER}/train_processed.txt" \
    --processed_file "${WOW_DATA_FOLDER}/output_response_prompts.txt"
#!/bin/bash
#########################
# Evaluate the F1 scores.
#########################
WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
(e.g., /testseen_knowledge_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \
(e.g., /testseen_knowledge_reference.txt)
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--task MSDP-EVAL-F1 \
--guess-file ${MODEL_GEN_PATH} \
--answer-file ${GROUND_TRUTH_PATH}
############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################
# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.
# To evaluate on these metrics, please setup the environments based on
# the nlg-eval github, and run the corresponding evaluation commands.
nlg-eval \
--hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
--references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
#!/bin/bash
#########################
# Evaluate the F1 scores.
#########################
WORLD_SIZE=1
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \
(e.g., /testseen_response_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE> \
(e.g., /testseen_response_reference.txt)
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--task MSDP-EVAL-F1 \
--guess-file ${MODEL_GEN_PATH} \
--answer-file ${GROUND_TRUTH_PATH}
##########################
# Evaluate the KF1 scores.
##########################
MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \
(e.g., /testseen_response_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \
(e.g., /testseen_knowledge_reference.txt)
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 4 \
--task MSDP-EVAL-F1 \
--guess-file ${MODEL_GEN_PATH} \
--answer-file ${GROUND_TRUTH_PATH}
############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################
# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.
# To evaluate on these metrics, please setup the environments based on
# the nlg-eval github, and run the corresponding evaluation commands.
nlg-eval \
--hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
--references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
#!/bin/bash
# Preparing the input file for the response generation (second-stage prompting)
DIR=`pwd`
TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA> \
(e.g., /testseen_processed.txt)
KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA> \
(e.g., /testseen_knowledge_generations.txt)
PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION> \
(e.g., /testseen_processed_with_generated_knowledge.txt)
python ${DIR}/tasks/msdp/preprocessing.py \
--func prepare_input \
--test_file ${TEST_FILE} \
--knwl_gen_file ${KNOWLEDGE_FILE} \
--processed_file ${PROCESSED_FILE}
#!/bin/bash
# Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
# The input contains prompts and current dialogue context, the output is the relevant knowledge
# The size of the pretrained language model is 357M
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE> \
(e.g., /testseen_processed.txt)
PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS> \
(e.g., /testseen_knowledge_prompts.json)
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
(e.g., /testseen_knowledge_generations.txt)
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--load ${CHECKPOINT_PATH} \
--fp16 \
--DDP-impl torch \
--tokenizer-type GPT2BPETokenizer \
--sample-input-file ${INPUT_PATH} \
--sample-output-file ${OUTPUT_PATH} \
--prompt-file ${PROMPT_PATH} \
--prompt-type knowledge \
--num-prompt-examples 10 \
--task MSDP-PROMPT
# NOTE: If you use api for the model generation, please use
# the "--api-prompt" flag (setting this value as True).
#!/bin/bash
# Stage-2: Prompt a pretrained language model to generate the corresponding response
# The input contains prompts, current dialogue context, and generated knowledge in Stage-1
# The output is the corresponding response.
# The size of the pretrained language model is 357M
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
INPUT_PATH=<PATH_OF_INPUT_TEST_DATA_FILE> (e.g., /testseen_processed.txt)
PROMPT_PATH=<PATH_OF_RESPONSE_GENERATION_PROMPTS> \
(e.g., /response_prompts.txt)
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
(e.g., /output_testseen_response_generations.txt)
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--load ${CHECKPOINT_PATH} \
--fp16 \
--DDP-impl torch \
--tokenizer-type GPT2BPETokenizer \
--sample-input-file ${INPUT_PATH} \
--sample-output-file ${OUTPUT_PATH} \
--prompt-file ${PROMPT_PATH} \
--prompt-type response \
--num-prompt-examples 20 \
--task MSDP-PROMPT
# NOTE: If you use api for the model generation, please use
# the "--api-prompt" flag (setting this value as True).
#!/bin/bash
# Launches pretrain_bert.py through torchrun (no distributed flags) with the
# model, data and logging options defined below.

export CUDA_DEVICE_MAX_CONNECTIONS=1

# Replace each <...> placeholder (quote the value if it contains spaces).
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence

# Flag groups are bash arrays rather than flat strings, so every element is
# passed as exactly one argument and paths are never word-split or globbed.
BERT_ARGS=(
    --num-layers 24
    --hidden-size 1024
    --num-attention-heads 16
    --seq-length 512
    --max-position-embeddings 512
    --micro-batch-size 4
    --global-batch-size 8
    --lr 0.0001
    --train-iters 2000000
    --lr-decay-iters 990000
    --lr-decay-style linear
    --min-lr 0.00001
    --weight-decay 1e-2
    --lr-warmup-fraction .01
    --clip-grad 1.0
    --fp16
)

DATA_ARGS=(
    --data-path "$DATA_PATH"
    --vocab-file "$VOCAB_FILE"
    --split 949,50,1
)

OUTPUT_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --eval-iters 10
)

# The same directory is used for saving and for loading checkpoints.
torchrun pretrain_bert.py \
    "${BERT_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${OUTPUT_ARGS[@]}" \
    --save "$CHECKPOINT_PATH" \
    --load "$CHECKPOINT_PATH"
#!/bin/bash
# Launches pretrain_bert.py through torchrun with GPUS_PER_NODE processes per
# node, using the NCCL distributed backend.

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
# Total process count; not referenced again in this script.
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))

# Replace each <...> placeholder (quote the value if it contains spaces).
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence

# Flag groups are bash arrays rather than flat strings, so every element is
# passed as exactly one argument and paths are never word-split or globbed.
DISTRIBUTED_ARGS=(
    --nproc_per_node "$GPUS_PER_NODE"
    --nnodes "$NNODES"
    --node_rank "$NODE_RANK"
    --master_addr "$MASTER_ADDR"
    --master_port "$MASTER_PORT"
)

BERT_ARGS=(
    --num-layers 24
    --hidden-size 1024
    --num-attention-heads 16
    --seq-length 512
    --max-position-embeddings 512
    --micro-batch-size 4
    --global-batch-size 32
    --lr 0.0001
    --train-iters 1000000
    --lr-decay-iters 990000
    --lr-decay-style linear
    --min-lr 1.0e-5
    --weight-decay 1e-2
    --lr-warmup-fraction .01
    --clip-grad 1.0
    --fp16
)

DATA_ARGS=(
    --data-path "$DATA_PATH"
    --vocab-file "$VOCAB_FILE"
    --split 949,50,1
)

OUTPUT_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --eval-iters 10
)

# The same directory is used for saving and for loading checkpoints.
torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_bert.py \
    "${BERT_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${OUTPUT_ARGS[@]}" \
    --distributed-backend nccl \
    --save "$CHECKPOINT_PATH" \
    --load "$CHECKPOINT_PATH"
#!/bin/bash
# Launches pretrain_bert.py through torchrun with GPUS_PER_NODE processes per
# node, tensor-model-parallel size 2 and pipeline-model-parallel size 2.

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
# Total process count; not referenced again in this script.
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))

# Replace each <...> placeholder (quote the value if it contains spaces).
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/bert-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence

# Flag groups are bash arrays rather than flat strings, so every element is
# passed as exactly one argument and paths are never word-split or globbed.
DISTRIBUTED_ARGS=(
    --nproc_per_node "$GPUS_PER_NODE"
    --nnodes "$NNODES"
    --node_rank "$NODE_RANK"
    --master_addr "$MASTER_ADDR"
    --master_port "$MASTER_PORT"
)

BERT_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 2
    --num-layers 24
    --hidden-size 1024
    --num-attention-heads 16
    --seq-length 512
    --max-position-embeddings 512
    --micro-batch-size 2
    --global-batch-size 16
    --lr 0.0001
    --train-iters 1000000
    --lr-decay-iters 990000
    --lr-decay-style linear
    --min-lr 1.0e-5
    --weight-decay 1e-2
    --lr-warmup-fraction .01
    --clip-grad 1.0
    --fp16
)

DATA_ARGS=(
    --data-path "$DATA_PATH"
    --vocab-file "$VOCAB_FILE"
    --split 949,50,1
)

OUTPUT_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --eval-iters 10
)

# The same directory is used for saving and for loading checkpoints.
torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_bert.py \
    "${BERT_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${OUTPUT_ARGS[@]}" \
    --distributed-backend nccl \
    --save "$CHECKPOINT_PATH" \
    --load "$CHECKPOINT_PATH"
#!/bin/bash

# Runs the "345M" parameter model
# Launches pretrain_gpt.py through torchrun (no distributed flags) with the
# model, data and logging options defined below.

export CUDA_DEVICE_MAX_CONNECTIONS=1

# Replace each <...> placeholder (quote the value if it contains spaces).
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document

# Flag groups are bash arrays rather than flat strings, so every element is
# passed as exactly one argument and paths are never word-split or globbed.
GPT_ARGS=(
    --num-layers 24
    --hidden-size 1024
    --num-attention-heads 16
    --seq-length 1024
    --max-position-embeddings 1024
    --micro-batch-size 4
    --global-batch-size 8
    --lr 0.00015
    --train-iters 500000
    --lr-decay-iters 320000
    --lr-decay-style cosine
    --min-lr 1.0e-5
    --weight-decay 1e-2
    --lr-warmup-fraction .01
    --clip-grad 1.0
    --fp16
)

DATA_ARGS=(
    --data-path "$DATA_PATH"
    --vocab-file "$VOCAB_FILE"
    --merge-file "$MERGE_FILE"
    --split 949,50,1
)

OUTPUT_ARGS=(
    --log-interval 100
    --save-interval 10000
    --eval-interval 1000
    --eval-iters 10
)

# The same directory is used for saving and for loading checkpoints.
torchrun pretrain_gpt.py \
    "${GPT_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${OUTPUT_ARGS[@]}" \
    --save "$CHECKPOINT_PATH" \
    --load "$CHECKPOINT_PATH"
#!/bin/bash
#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
# SLURM batch script: builds one pretrain_gpt.py command line and runs it
# through `srun` inside a container. Every <...> token is a placeholder that
# must be replaced before submitting.
# The launch directory is used both to locate pretrain_gpt.py and for logs/.
DIR=`pwd`
# Timestamp embedded in the log file name.
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
# Number/path pairs handed to --data-path; presumably per-dataset blend
# weights (they sum to 1.0) - confirm against the pretrain_gpt.py options.
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
# All training flags are collected in one string because the whole command is
# later passed as a single argument to `sh -c`, which re-parses it.
options=" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 16 \
--num-layers 96 \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--global-batch-size 1536 \
--rampup-batch-size 16 16 5859375 \
--train-samples 146484375 \
--lr-decay-samples 126953125 \
--lr-warmup-samples 183105 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 10 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file <PATH TO gpt-vocab.json> \
--merge-file <PATH TO gpt-merges.txt> \
--save-interval 1000 \
--save <PATH TO CHECKPOINTS DIRECTORY> \
--load <PATH TO CHECKPOINTS DIRECTORY> \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--tensorboard-dir <TENSORBOARD DIRECTORY> \
--fp16 "
# Arguments given to this script ($@) are placed before the built-in options.
run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
# -l labels each output line; %x and %j in the log name are srun filename
# patterns (presumably job name and job id - confirm in the srun manual).
srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
# Turns shell command tracing off; no matching `set -x` is visible in this script.
set +x
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment