Commit 160bf237 authored by wangxj

Update to 0.12

parent b01809dd
Pipeline #2448 failed with stages
#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# Load the hipBLASLt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# Use an updated rocBLAS build
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0224/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0227/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# Torch: convert multi-stream execution to a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# Add synchronization during profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# Capture rocBLAS GEMM sizes
# export ROCBLAS_LAYER=3
# Capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# Increase the compile cache size
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the CUTLASS flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the Triton flash-attention path
# --transformer-impl transformer_engine # use these two options for the Megatron-Core path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use these two options for the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires specifying loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in data-parallel communication, average gradients/params directly in the collective instead of summing (to one device) and then averaging
# --recompute-granularity full # enable recomputation: lower memory usage, higher runtime
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce with compute
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization has not been adapted yet
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--use-flash-attn
)
# Environment variables for the Torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # when enabled ...
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
# --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
# torchrun --nproc_per_node 8 pretrain_gpt.py \
# ${GPT_MODEL_ARGS[@]} \
# ${TRAINING_ARGS[@]} \
# ${MODEL_PARALLEL_ARGS[@]} \
# ${DATA_ARGS[@]} \
# ${EVAL_AND_LOGGING_ARGS[@]}
APP="python -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# Enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20 # channel
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# TE GEMM calls require the hipBLASLt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# Use an updated rocBLAS build
export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH
# Add synchronization during profiling capture
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
# GPT_MODEL_ARGS=(
# --num-layers 32
# --hidden-size 5120
# --ffn-hidden-size 13824
# --num-attention-heads 40
# --seq-length 4096 #4096
# --max-position-embeddings 32768 #4096
# --num-query-groups 40
# --group-query-attention
# )
GPT_MODEL_ARGS=(
--num-layers 6
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--seq-length 4096 #4096
--max-position-embeddings 4096
)
# export NVTE_FLASH_ATTN=1 # use the CUTLASS flash-attention path
export NVTE_FLASH_ATTN_TRITON=1 # use the Triton flash-attention path
# --transformer-impl transformer_engine
# --use-mcore-models
# --transformer-impl local
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl transformer_engine
--use-mcore-models
--micro-batch-size 1
--global-batch-size 6 #240 #60 #512 #64
--train-iters 10
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--no-gradient-accumulation-fusion
--add-qkv-bias
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective
--recompute-granularity full
--recompute-num-layers 5 #0 #
--recompute-method block
--overlap-grad-reduce
--use-flash-attn-triton
)
# --add-qkv-bias # qwen
# --ckpt-format torch
# --ddp-average-in-collective
# --recompute-granularity full
# --recompute-num-layers 5
# --recompute-method block
# --overlap-grad-reduce
# --use-flash-attn-cutlass
# --use-flash-attn-triton
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 2
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 3
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34566
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
${PROFILE_ARGS[@]} \
"
export HIP_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3 # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
[2])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
[3])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
# [4])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [5])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [6])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [7])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
esac
@@ -29,7 +29,8 @@ class BlendedDataset(torch.utils.data.Dataset):
weights (List[Union[int, float]]): The weights that determine the dataset blend ratios
size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx].
size (Optional[int]): The number of samples to draw from the blend. If None, for each
dataset index idx draw exactly weights[idx] samples from datasets[idx].
config (BlendedMegatronDatasetConfig): The config
@@ -74,7 +75,6 @@ class BlendedDataset(torch.utils.data.Dataset):
unique_identifiers["split"] = self.split.name
unique_identifiers["weights"] = self.weights
unique_identifiers["size"] = self.size
unique_identifiers["renormalize_blend_weights"] = self.config.renormalize_blend_weights
self.unique_description = json.dumps(
unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
@@ -168,7 +168,7 @@ class BlendedDataset(torch.utils.data.Dataset):
log_single_rank(
logger,
logging.WARNING,
f"Unable to save the {type(self).__name__} indexes because path_to_cache is None",
f"Cannot save the {type(self).__name__} indexes because path_to_cache is None",
)
t_end = time.time()
@@ -34,7 +34,9 @@ class BlendedMegatronDatasetBuilder(object):
sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split
is_built_on_rank (Callable): A callable which returns True if the dataset should be built on the current rank and False otherwise. It should be Megatron Core parallelism aware i.e. global rank, local group rank, and virtual rank may inform its return value.
is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
global rank, local group rank, and virtual rank may inform its return value.
config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
"""
@@ -54,7 +56,7 @@ class BlendedMegatronDatasetBuilder(object):
log_single_rank(
logger,
logging.INFO,
f"Building dataset splits with cls={cls.__name__}, sizes={self.sizes}, and config={self.config}",
f"Building {cls.__name__} splits with sizes={self.sizes} and config={self.config}",
)
if not self.config.mock:
@@ -96,7 +98,8 @@ class BlendedMegatronDatasetBuilder(object):
(2) The split has one contributing dataset, and...
(a) 'size' is not None
- Build a mid-level dataset with low-level dataset sampling in proportion to the size
- Build a mid-level dataset with low-level dataset sampling in proportion to the
size
(b) 'size' is None
- Build mid-level datasets with no excess low-level dataset sampling
@@ -104,24 +107,27 @@ class BlendedMegatronDatasetBuilder(object):
(3) The split has multiple contributing datasets, and...
(a) 'weights' is not None and 'size' is not None
- Build mid-level datasets with low-level dataset sampling in proportion to their weights and the size
- Build a top-level dataset of length marginally greater than 'size' with mid-level dataset sampling in proportion to their weights and the size
- Build mid-level datasets with low-level dataset sampling in proportion to their
weights and the size
- Build a top-level dataset of length marginally greater than 'size' with mid-level
dataset sampling in proportion to their weights and the size
(b) 'weights' is not None and 'size' is None
- Error
(c) 'weights' is None and 'size' is not None
- Build mid-level datasets with no excess low-level dataset sampling
- Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size
- The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths
- Build a top-level dataset of length 'size' (capped at the sum of the mid-level
dataset lengths) with mid-level dataset sampling in proportion to their lengths
and the size
(d) 'weights' is None and 'size' is None
- Build mid-level datasets with no excess low-level dataset sampling
- Build a top-level dataset with no excess mid-level dataset sampling
Returns:
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
split
"""
datasets = self._build_blended_dataset_splits()
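The case analysis above can be condensed into a standalone sketch (a hypothetical helper, not part of the builder; the returned strings only paraphrase the behaviors described in the docstring):

from typing import List, Optional

def blend_plan(weights: Optional[List[float]], size: Optional[int]) -> str:
    # Hypothetical summary of cases (3)(a)-(d) from the docstring above.
    if weights is not None and size is not None:
        # (3)(a): weighted mid-level sampling; top-level length marginally greater than size.
        return "weighted blend sized to the request"
    if weights is not None and size is None:
        # (3)(b): not supported.
        raise ValueError("client-specified weights require a client-specified size")
    if weights is None and size is not None:
        # (3)(c): blend in proportion to mid-level dataset lengths, capped at their total.
        return "length-proportional blend capped at the available samples"
    # (3)(d): exhaustive indices with no excess sampling.
    return "exhaustive blend"

# e.g. blend_plan([0.3, 0.7], 1000) takes the weighted path; blend_plan([0.3, 0.7], None) raises.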
@@ -134,24 +140,35 @@ class BlendedMegatronDatasetBuilder(object):
log_single_rank(
logger,
logging.INFO,
f"Verifying NumPy indices for {type(dataset).__name__} {dataset.split.name} split",
(
f"Verifying NumPy indices for {type(dataset).__name__} "
f"{dataset.split.name} split"
),
)
else:
log_single_rank(
logger,
logging.INFO,
f"NumPy indices for {type(dataset).__name__} {dataset.split.name} split are fully cached, skipping verification",
(
f"NumPy indices for {type(dataset).__name__} {dataset.split.name} "
f"split are fully cached, skipping verification"
),
)
continue
# Check blend size
assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0]
# Check blend access of mid-level datasets
_, sizes = numpy.unique(dataset.dataset_index, return_counts=True)
for i, dataset_and_size in enumerate(zip(dataset.datasets, sizes)):
if len(dataset_and_size[0]) < dataset_and_size[1]:
dataset_indices, dataset_sizes = numpy.unique(
dataset.dataset_index, return_counts=True
)
for i, (index, size) in enumerate(zip(dataset_indices, dataset_sizes)):
if len(dataset.datasets[index]) < size:
raise IndexError(
f"The {dataset.split.name} blend oversamples (N = {dataset_and_size[1]}) {type(dataset_and_size[0]).__name__} {i} (len = {len(dataset_and_size[0])}). "
f"Set renormalize_blend_weights to True and re-run. File an issue if the problem is not resolved."
f"The {dataset.split.name} blend oversamples the contributing "
f"datasets and, e.g., requests {size} samples from "
f"{type(dataset.datasets[index]).__name__} {i} with size "
f"{len(dataset.datasets[index])}. This is unexpected. "
f"Please file an issue."
)
return datasets
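A minimal, self-contained illustration of the oversampling check above, using hypothetical index data; numpy.unique(..., return_counts=True) reports how many samples the blend requests from each contributing mid-level dataset:

import numpy

# Hypothetical blend: entry i of dataset_index names the mid-level dataset for sample i.
dataset_index = numpy.array([0, 1, 0, 1, 1, 0, 1])
mid_level_lengths = [3, 4]  # hypothetical lengths of the two mid-level datasets

indices, counts = numpy.unique(dataset_index, return_counts=True)
for index, count in zip(indices, counts):
    # The blend oversamples dataset `index` if it requests more samples than exist.
    if mid_level_lengths[index] < count:
        raise IndexError(
            f"blend requests {count} samples from dataset {index} "
            f"with only {mid_level_lengths[index]} available"
        )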
@@ -162,7 +179,8 @@ class BlendedMegatronDatasetBuilder(object):
See the BlendedMegatronDatasetBuilder.build alias for more information.
Returns:
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
split
"""
##
# Return fake "mock" datasets
@@ -192,13 +210,19 @@ class BlendedMegatronDatasetBuilder(object):
# Build the mid-level datasets
if weights is None:
sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
# Build only one "epoch"
sizes_per_dataset_buffer = [[None for split in Split] for prefix in prefixes]
else:
sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes)
# The number of samples we plan to use per dataset
sizes_per_dataset_target = _get_size_per_split_per_dataset(weights, self.sizes)
# The number of samples we plan to build per dataset
sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
weights, self.sizes, margin=0.5
)
# build each dataset in parallel
# Build each dataset in parallel
megatron_datasets = self._build_megatron_datasets_parallel(
prefixes, split, sizes_per_dataset
prefixes, split, sizes_per_dataset_buffer
)
# Build the top-level datasets
@@ -207,11 +231,11 @@ class BlendedMegatronDatasetBuilder(object):
if split[i] is not None:
weights_i = weights
if weights_i is not None and self.sizes[i] is not None:
size_per_dataset = list(zip(*sizes_per_dataset))[i]
# Blend according to client-specified weights and client-specified size
size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
size_i = sum(size_per_dataset)
if self.config.renormalize_blend_weights:
weights_i = list(map(lambda _size: _size / size_i, size_per_dataset))
elif weights_i is None:
# Blend according to dataset sizes as-is and (maybe) client-specified size
try:
weights_i = [
len(megatron_dataset) for megatron_dataset in megatron_datasets[i]
@@ -221,9 +245,12 @@ class BlendedMegatronDatasetBuilder(object):
if self.sizes[i] is not None:
size_i = min(self.sizes[i], sum(weights_i))
else:
size_i = None # => the size will be sum(weights_i)
# Build exhaustive indices
size_i = None
else:
raise RuntimeError
raise ValueError(
"Using client-specified weights requires client-specified size"
)
blended_datasets[i] = self.build_generic_dataset(
BlendedDataset,
self.is_built_on_rank,
@@ -263,22 +290,31 @@ class BlendedMegatronDatasetBuilder(object):
# Build mid-level datasets
if weights is None:
sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
sizes_per_dataset_buffer = [
[None for split in Split] for prefix in prefixes
]
else:
sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof)
# The number of samples we plan to use per dataset
sizes_per_dataset_target = _get_size_per_split_per_dataset(
weights, sizes_spoof
)
# The number of samples we plan to build per dataset
sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
weights, sizes_spoof, margin=0.5
)
# build each dataset in parallel
# Build each dataset in parallel
megatron_datasets = self._build_megatron_datasets_parallel(
prefixes, split_spoof, sizes_per_dataset
prefixes, split_spoof, sizes_per_dataset_buffer
)[i]
# Build top-level dataset
if weights is not None and self.sizes[i] is not None:
size_per_dataset = list(zip(*sizes_per_dataset))[i]
# Blend according to client-specified weights and client-specified size
size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
size = sum(size_per_dataset)
if self.config.renormalize_blend_weights:
weights = list(map(lambda _size: _size / size, size_per_dataset))
elif weights is None:
# Blend according to dataset sizes as-is and (maybe) client-specified size
try:
weights = [
len(megatron_dataset) for megatron_dataset in megatron_datasets
@@ -288,7 +324,8 @@ class BlendedMegatronDatasetBuilder(object):
if self.sizes[i] is not None:
size = min(self.sizes[i], sum(weights))
else:
size = None # => the size will be sum(weights)
# Build exhaustive indices
size = None
else:
raise RuntimeError
blended_datasets[i] = self.build_generic_dataset(
@@ -395,13 +432,15 @@ class BlendedMegatronDatasetBuilder(object):
"""Build each MidLevelDataset split from a single LowLevelDataset
Args:
dataset_path (Optional[str]): The path on disk which defines the underlying LowLevelDataset, or None for mock dataset classes
dataset_path (Optional[str]): The path on disk which defines the underlying
LowLevelDataset, or None for mock dataset classes
split (List[Tuple[float, float]]): The dataset split matrix
sizes (List[int]): The number of total samples to draw from each split
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
behavior. Set to False when we enforce this behavior at higher level.
Returns:
List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split
@@ -462,17 +501,22 @@ class BlendedMegatronDatasetBuilder(object):
and torch.distributed is initialized.
Args:
cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be
built. In special cases, e.g. when we are building the low level dataset for a
RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
behavior. Set to False when we enforce this behavior at higher level.
args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class
args (Tuple[Any]): The positional arguments used to build the provided
DistributedDataset class
Raises:
Exception: When the dataset constructor raises an OSError
Returns:
Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the Iterable instantiation, or None
Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the
Iterable instantiation, or None
"""
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
@@ -485,10 +529,10 @@ class BlendedMegatronDatasetBuilder(object):
dataset = cls(*args)
except OSError as err:
log = (
f"Failed to write dataset materials to the data cache directory. "
+ f"Please supply a directory to which you have write access via "
+ f"the path_to_cache attribute in BlendedMegatronDatasetConfig and "
+ f"retry. Refer to the preserved traceback above for more information."
f"Failed to write dataset materials to the data cache directory. Please "
f"supply a directory to which you have write access via the path_to_cache "
f"attribute in BlendedMegatronDatasetConfig and retry. Refer to the "
f"preserved traceback above for more information."
)
raise Exception(log) from err
@@ -505,23 +549,30 @@ class BlendedMegatronDatasetBuilder(object):
def _get_size_per_split_per_dataset(
normalized_weights: List[float], target_size_per_split: List[int]
normalized_weights: List[float], target_size_per_split: List[int], margin: float = 0.0
) -> List[List[int]]:
"""Determine the contribution of the MegatronDataset splits to the BlendedDataset splits
Args:
normalized_weights (List[float]): e.g. [0.3, 0.7]
target_size_per_split (List[int]): The number of samples to target for each BlendedDataset split
target_size_per_split (List[int]): The number of samples to target for each BlendedDataset
split
margin (float): The relative quantity of extra samples to build per per split per dataset,
as a percentage
Returns:
List[List[int]]: The number of samples to request per MegatronDataset per split
"""
assert numpy.isclose(sum(normalized_weights), 1.0)
# Use 0.5% target margin to ensure we satiate the request
# Use margin as buffer to ensure we satiate the request
sizes_per_dataset = [
[int(math.ceil(target_size * weight * 1.005)) for target_size in target_size_per_split]
[
int(math.ceil(math.ceil(target_size * weight) * (1 + margin / 100)))
for target_size in target_size_per_split
]
for weight in normalized_weights
]
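A worked example of the sizing above, with hypothetical weights and a hypothetical per-split target. margin=0.0 reproduces the target sizes (sizes_per_dataset_target) and margin=0.5 reproduces the 0.5% buffer (sizes_per_dataset_buffer) that the builder requests:

import math
from typing import List

def size_per_split_per_dataset(
    weights: List[float], targets: List[int], margin: float = 0.0
) -> List[List[int]]:
    # Same formula as above: per-dataset request, padded by `margin` percent.
    return [
        [
            int(math.ceil(math.ceil(target * weight) * (1 + margin / 100)))
            for target in targets
        ]
        for weight in weights
    ]

weights = [0.3, 0.7]  # hypothetical normalized blend weights
targets = [1000]      # hypothetical number of samples requested for one split

print(size_per_split_per_dataset(weights, targets))              # [[300], [700]]
print(size_per_split_per_dataset(weights, targets, margin=0.5))  # [[302], [704]]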
@@ -34,12 +34,6 @@ class BlendedMegatronDatasetConfig:
'blend'. Defauls to None.
"""
renormalize_blend_weights: bool = False
"""Renormalize the blend weights to account for mid-level dataset oversampling done to ensure
fulfillmenet of the of the requested number of samples. Defaults to False for backward
comparability in the data sample order.
"""
split: Optional[str] = None
"""The split string, a comma separated weighting for the dataset splits when drawing samples
from a single distribution. Not to be used with 'blend_per_split'. Defaults to None.
@@ -67,7 +61,7 @@ class BlendedMegatronDatasetConfig:
"""
tokenizer: Optional[MegatronTokenizer] = None
"""The MegatronTokenizer instance or None. Required for datasets which do online tokenization."""
"""The MegatronTokenizer instance. Required for datasets that do online tokenization."""
def __post_init__(self) -> None:
"""Do asserts and set fields post init"""
@@ -149,7 +143,8 @@ def convert_split_vector_to_split_matrix(
Args:
vector_a (List[float]): The primary split vector
vector_b (Optional[List[float]]): An optional secondary split vector which constrains the primary split vector. Defaults to None.
vector_b (Optional[List[float]]): An optional secondary split vector which constrains the
primary split vector. Defaults to None.
Returns:
List[Tuple[float, float]]: The split matrix consisting of book-ends of each split in order
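For intuition, a standalone sketch of the book-end computation for the primary vector only (the optional vector_b constraint is omitted, and this is not the library implementation). The --split 949,50,1 value used in the scripts above normalizes to roughly [0.949, 0.05, 0.001]:

from typing import List, Tuple

def split_vector_to_bookends(vector: List[float]) -> List[Tuple[float, float]]:
    # Cumulative book-ends: split i covers the [start, end) slice of the documents.
    bookends, start = [], 0.0
    for weight in vector:
        bookends.append((start, start + weight))
        start += weight
    return bookends

print(split_vector_to_bookends([0.949, 0.05, 0.001]))
# -> approximately [(0.0, 0.949), (0.949, 0.999), (0.999, 1.0)]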