Commit 160bf237 authored by wangxj

Update to 0.12

parent b01809dd
Pipeline #2448 failed with stages
#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# Load the hipBLASLt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# Use an updated rocBLAS build
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0224/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/rocblas-install-0227/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# Torch: convert multi-stream execution to a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1
# Add synchronization during profiling capture to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# Capture rocBLAS GEMM sizes
# export ROCBLAS_LAYER=3
# Capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1
# Increase the compile cache size
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
)
export NVTE_FLASH_ATTN=1 # use the CUTLASS flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the Triton flash-attention path
# --transformer-impl transformer_engine # use these two options for the Megatron-Core path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use these two options for the legacy path
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires specifying loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in data-parallel communication, average gradients/params directly in the collective instead of summing (to one device) and then averaging
# --recompute-granularity full # enable recomputation: lower memory usage, higher runtime
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce with compute
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization has not been adapted yet
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--use-flash-attn
)
# Environment variables for the Torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # when enabled ...
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /public/home/wangxj/Downloads/model_weights/llama2_7b_hf/tokenizer.model
# --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
# torchrun --nproc_per_node 8 pretrain_gpt.py \
# ${GPT_MODEL_ARGS[@]} \
# ${TRAINING_ARGS[@]} \
# ${MODEL_PARALLEL_ARGS[@]} \
# ${DATA_ARGS[@]} \
# ${EVAL_AND_LOGGING_ARGS[@]}
APP="python -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# Enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
#!/bin/bash
set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20 # channel
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# TE GEMM calls require the hipBLASLt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# Use an updated rocBLAS build
export LD_LIBRARY_PATH=/data/rocblas-install/lib:$LD_LIBRARY_PATH
# Add synchronization during profiling capture
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
CHECKPOINT_PATH=./tmp_7b #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" #<Specify path and file prefix>_text_document
# GPT_MODEL_ARGS=(
# --num-layers 32
# --hidden-size 5120
# --ffn-hidden-size 13824
# --num-attention-heads 40
# --seq-length 4096 #4096
# --max-position-embeddings 32768 #4096
# --num-query-groups 40
# --group-query-attention
# )
GPT_MODEL_ARGS=(
--num-layers 6
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--seq-length 4096 #4096
--max-position-embeddings 4096
)
# export NVTE_FLASH_ATTN=1 # use the CUTLASS flash-attention path
export NVTE_FLASH_ATTN_TRITON=1 # use the Triton flash-attention path
# --transformer-impl transformer_engine
# --use-mcore-models
# --transformer-impl local
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl transformer_engine
--use-mcore-models
--micro-batch-size 1
--global-batch-size 6 #240 #60 #512 #64
--train-iters 10
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--no-gradient-accumulation-fusion
--add-qkv-bias
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective
--recompute-granularity full
--recompute-num-layers 5 #0 #
--recompute-method block
--overlap-grad-reduce
--use-flash-attn-triton
)
# --add-qkv-bias # qwen
# --ckpt-format torch
# --ddp-average-in-collective
# --recompute-granularity full
# --recompute-num-layers 5
# --recompute-method block
# --overlap-grad-reduce
# --use-flash-attn-cutlass
# --use-flash-attn-triton
MODEL_PARALLEL_ARGS=(
--sequence-parallel
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 2
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--log-throughput
--save-interval 1000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
PROFILE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 3
--profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34566
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
APP="python -u pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
${PROFILE_ARGS[@]} \
"
export HIP_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3 # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
[0])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
[2])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
[3])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
;;
# [4])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [5])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [6])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [7])
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# ${APP}
# # numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
esac
@@ -29,7 +29,8 @@ class BlendedDataset(torch.utils.data.Dataset):
weights (List[Union[int, float]]): The weights that determine the dataset blend ratios
size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx].
size (Optional[int]): The number of samples to draw from the blend. If None, for each
dataset index idx draw exactly weights[idx] samples from datasets[idx].
config (BlendedMegatronDatasetConfig): The config
@@ -74,7 +75,6 @@ class BlendedDataset(torch.utils.data.Dataset):
unique_identifiers["split"] = self.split.name
unique_identifiers["weights"] = self.weights
unique_identifiers["size"] = self.size
unique_identifiers["renormalize_blend_weights"] = self.config.renormalize_blend_weights
self.unique_description = json.dumps(
unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
@@ -168,7 +168,7 @@ class BlendedDataset(torch.utils.data.Dataset):
log_single_rank(
logger,
logging.WARNING,
f"Unable to save the {type(self).__name__} indexes because path_to_cache is None",
f"Cannot save the {type(self).__name__} indexes because path_to_cache is None",
)
t_end = time.time()
@@ -34,7 +34,9 @@ class BlendedMegatronDatasetBuilder(object):
sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split
is_built_on_rank (Callable): A callable which returns True if the dataset should be built on the current rank and False otherwise. It should be Megatron Core parallelism aware i.e. global rank, local group rank, and virtual rank may inform its return value.
is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
global rank, local group rank, and virtual rank may inform its return value.
config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
"""
@@ -54,7 +56,7 @@ class BlendedMegatronDatasetBuilder(object):
log_single_rank(
logger,
logging.INFO,
f"Building dataset splits with cls={cls.__name__}, sizes={self.sizes}, and config={self.config}",
f"Building {cls.__name__} splits with sizes={self.sizes} and config={self.config}",
)
if not self.config.mock:
@@ -96,7 +98,8 @@ class BlendedMegatronDatasetBuilder(object):
(2) The split has one contributing dataset, and...
(a) 'size' is not None
- Build a mid-level dataset with low-level dataset sampling in proportion to the size
- Build a mid-level dataset with low-level dataset sampling in proportion to the
size
(b) 'size' is None
- Build mid-level datasets with no excess low-level dataset sampling
@@ -104,24 +107,27 @@ class BlendedMegatronDatasetBuilder(object):
(3) The split has multiple contributing datasets, and...
(a) 'weights' is not None and 'size' is not None
- Build mid-level datasets with low-level dataset sampling in proportion to their weights and the size
- Build a top-level dataset of length marginally greater than 'size' with mid-level dataset sampling in proportion to their weights and the size
- Build mid-level datasets with low-level dataset sampling in proportion to their
weights and the size
- Build a top-level dataset of length marginally greater than 'size' with mid-level
dataset sampling in proportion to their weights and the size
(b) 'weights' is not None and 'size' is None
- Error
(c) 'weights' is None and 'size' is not None
- Build mid-level datasets with no excess low-level dataset sampling
- Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size
- The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths
- Build a top-level dataset of length 'size' (capped at the sum of the mid-level
dataset lengths) with mid-level dataset sampling in proportion to their lengths
and the size
(d) 'weights' is None and 'size' is None
- Build mid-level datasets with no excess low-level dataset sampling
- Build a top-level dataset with no excess mid-level dataset sampling
Returns:
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
split
"""
datasets = self._build_blended_dataset_splits()
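The case analysis above can be condensed into a standalone sketch (a hypothetical helper, not part of the builder; the returned strings only paraphrase the behaviors described in the docstring):

from typing import List, Optional

def blend_plan(weights: Optional[List[float]], size: Optional[int]) -> str:
    # Hypothetical summary of cases (3)(a)-(d) from the docstring above.
    if weights is not None and size is not None:
        # (3)(a): weighted mid-level sampling; top-level length marginally greater than size.
        return "weighted blend sized to the request"
    if weights is not None and size is None:
        # (3)(b): not supported.
        raise ValueError("client-specified weights require a client-specified size")
    if weights is None and size is not None:
        # (3)(c): blend in proportion to mid-level dataset lengths, capped at their total.
        return "length-proportional blend capped at the available samples"
    # (3)(d): exhaustive indices with no excess sampling.
    return "exhaustive blend"

# e.g. blend_plan([0.3, 0.7], 1000) takes the weighted path; blend_plan([0.3, 0.7], None) raises.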
@@ -134,24 +140,35 @@ class BlendedMegatronDatasetBuilder(object):
log_single_rank(
logger,
logging.INFO,
f"Verifying NumPy indices for {type(dataset).__name__} {dataset.split.name} split",
(
f"Verifying NumPy indices for {type(dataset).__name__} "
f"{dataset.split.name} split"
),
)
else:
log_single_rank(
logger,
logging.INFO,
f"NumPy indices for {type(dataset).__name__} {dataset.split.name} split are fully cached, skipping verification",
(
f"NumPy indices for {type(dataset).__name__} {dataset.split.name} "
f"split are fully cached, skipping verification"
),
)
continue
# Check blend size
assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0]
# Check blend access of mid-level datasets
_, sizes = numpy.unique(dataset.dataset_index, return_counts=True)
for i, dataset_and_size in enumerate(zip(dataset.datasets, sizes)):
if len(dataset_and_size[0]) < dataset_and_size[1]:
dataset_indices, dataset_sizes = numpy.unique(
dataset.dataset_index, return_counts=True
)
for i, (index, size) in enumerate(zip(dataset_indices, dataset_sizes)):
if len(dataset.datasets[index]) < size:
raise IndexError(
f"The {dataset.split.name} blend oversamples (N = {dataset_and_size[1]}) {type(dataset_and_size[0]).__name__} {i} (len = {len(dataset_and_size[0])}). "
f"Set renormalize_blend_weights to True and re-run. File an issue if the problem is not resolved."
f"The {dataset.split.name} blend oversamples the contributing "
f"datasets and, e.g., requests {size} samples from "
f"{type(dataset.datasets[index]).__name__} {i} with size "
f"{len(dataset.datasets[index])}. This is unexpected. "
f"Please file an issue."
)
return datasets
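A minimal, self-contained illustration of the oversampling check above, using hypothetical index data; numpy.unique(..., return_counts=True) reports how many samples the blend requests from each contributing mid-level dataset:

import numpy

# Hypothetical blend: entry i of dataset_index names the mid-level dataset for sample i.
dataset_index = numpy.array([0, 1, 0, 1, 1, 0, 1])
mid_level_lengths = [3, 4]  # hypothetical lengths of the two mid-level datasets

indices, counts = numpy.unique(dataset_index, return_counts=True)
for index, count in zip(indices, counts):
    # The blend oversamples dataset `index` if it requests more samples than exist.
    if mid_level_lengths[index] < count:
        raise IndexError(
            f"blend requests {count} samples from dataset {index} "
            f"with only {mid_level_lengths[index]} available"
        )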
@@ -162,7 +179,8 @@ class BlendedMegatronDatasetBuilder(object):
See the BlendedMegatronDatasetBuilder.build alias for more information.
Returns:
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
split
"""
##
# Return fake "mock" datasets
@@ -192,13 +210,19 @@ class BlendedMegatronDatasetBuilder(object):
# Build the mid-level datasets
if weights is None:
sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
# Build only one "epoch"
sizes_per_dataset_buffer = [[None for split in Split] for prefix in prefixes]
else:
sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes)
# The number of samples we plan to use per dataset
sizes_per_dataset_target = _get_size_per_split_per_dataset(weights, self.sizes)
# The number of samples we plan to build per dataset
sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
weights, self.sizes, margin=0.5
)
# build each dataset in parallel
# Build each dataset in parallel
megatron_datasets = self._build_megatron_datasets_parallel(
prefixes, split, sizes_per_dataset
prefixes, split, sizes_per_dataset_buffer
)
# Build the top-level datasets
@@ -207,11 +231,11 @@ class BlendedMegatronDatasetBuilder(object):
if split[i] is not None:
weights_i = weights
if weights_i is not None and self.sizes[i] is not None:
size_per_dataset = list(zip(*sizes_per_dataset))[i]
# Blend according to client-specified weights and client-specified size
size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
size_i = sum(size_per_dataset)
if self.config.renormalize_blend_weights:
weights_i = list(map(lambda _size: _size / size_i, size_per_dataset))
elif weights_i is None:
# Blend according to dataset sizes as-is and (maybe) client-specified size
try:
weights_i = [
len(megatron_dataset) for megatron_dataset in megatron_datasets[i]
@@ -221,9 +245,12 @@ class BlendedMegatronDatasetBuilder(object):
if self.sizes[i] is not None:
size_i = min(self.sizes[i], sum(weights_i))
else:
size_i = None # => the size will be sum(weights_i)
# Build exhaustive indices
size_i = None
else:
raise RuntimeError
raise ValueError(
"Using client-specified weights requires client-specified size"
)
blended_datasets[i] = self.build_generic_dataset(
BlendedDataset,
self.is_built_on_rank,
@@ -263,22 +290,31 @@ class BlendedMegatronDatasetBuilder(object):
# Build mid-level datasets
if weights is None:
sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
sizes_per_dataset_buffer = [
[None for split in Split] for prefix in prefixes
]
else:
sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof)
# The number of samples we plan to use per dataset
sizes_per_dataset_target = _get_size_per_split_per_dataset(
weights, sizes_spoof
)
# The number of samples we plan to build per dataset
sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
weights, sizes_spoof, margin=0.5
)
# build each dataset in parallel
# Build each dataset in parallel
megatron_datasets = self._build_megatron_datasets_parallel(
prefixes, split_spoof, sizes_per_dataset
prefixes, split_spoof, sizes_per_dataset_buffer
)[i]
# Build top-level dataset
if weights is not None and self.sizes[i] is not None:
size_per_dataset = list(zip(*sizes_per_dataset))[i]
# Blend according to client-specified weights and client-specified size
size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
size = sum(size_per_dataset)
if self.config.renormalize_blend_weights:
weights = list(map(lambda _size: _size / size, size_per_dataset))
elif weights is None:
# Blend according to dataset sizes as-is and (maybe) client-specified size
try:
weights = [
len(megatron_dataset) for megatron_dataset in megatron_datasets
@@ -288,7 +324,8 @@ class BlendedMegatronDatasetBuilder(object):
if self.sizes[i] is not None:
size = min(self.sizes[i], sum(weights))
else:
size = None # => the size will be sum(weights)
# Build exhaustive indices
size = None
else:
raise RuntimeError
blended_datasets[i] = self.build_generic_dataset(
@@ -395,13 +432,15 @@ class BlendedMegatronDatasetBuilder(object):
"""Build each MidLevelDataset split from a single LowLevelDataset
Args:
dataset_path (Optional[str]): The path on disk which defines the underlying LowLevelDataset, or None for mock dataset classes
dataset_path (Optional[str]): The path on disk which defines the underlying
LowLevelDataset, or None for mock dataset classes
split (List[Tuple[float, float]]): The dataset split matrix
sizes (List[int]): The number of total samples to draw from each split
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
behavior. Set to False when we enforce this behavior at higher level.
Returns:
List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split
@@ -462,17 +501,22 @@ class BlendedMegatronDatasetBuilder(object):
and torch.distributed is initialized.
Args:
cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be
built. In special cases, e.g. when we are building the low level dataset for a
RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.
synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
behavior. Set to False when we enforce this behavior at higher level.
args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class
args (Tuple[Any]): The positional arguments used to build the provided
DistributedDataset class
Raises:
Exception: When the dataset constructor raises an OSError
Returns:
Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the Iterable instantiation, or None
Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the
Iterable instantiation, or None
"""
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
@@ -485,10 +529,10 @@ class BlendedMegatronDatasetBuilder(object):
dataset = cls(*args)
except OSError as err:
log = (
f"Failed to write dataset materials to the data cache directory. "
+ f"Please supply a directory to which you have write access via "
+ f"the path_to_cache attribute in BlendedMegatronDatasetConfig and "
+ f"retry. Refer to the preserved traceback above for more information."
f"Failed to write dataset materials to the data cache directory. Please "
f"supply a directory to which you have write access via the path_to_cache "
f"attribute in BlendedMegatronDatasetConfig and retry. Refer to the "
f"preserved traceback above for more information."
)
raise Exception(log) from err
@@ -505,23 +549,30 @@ class BlendedMegatronDatasetBuilder(object):
def _get_size_per_split_per_dataset(
normalized_weights: List[float], target_size_per_split: List[int]
normalized_weights: List[float], target_size_per_split: List[int], margin: float = 0.0
) -> List[List[int]]:
"""Determine the contribution of the MegatronDataset splits to the BlendedDataset splits
Args:
normalized_weights (List[float]): e.g. [0.3, 0.7]
target_size_per_split (List[int]): The number of samples to target for each BlendedDataset split
target_size_per_split (List[int]): The number of samples to target for each BlendedDataset
split
margin (float): The relative quantity of extra samples to build per per split per dataset,
as a percentage
Returns:
List[List[int]]: The number of samples to request per MegatronDataset per split
"""
assert numpy.isclose(sum(normalized_weights), 1.0)
# Use 0.5% target margin to ensure we satiate the request
# Use margin as buffer to ensure we satiate the request
sizes_per_dataset = [
[int(math.ceil(target_size * weight * 1.005)) for target_size in target_size_per_split]
[
int(math.ceil(math.ceil(target_size * weight) * (1 + margin / 100)))
for target_size in target_size_per_split
]
for weight in normalized_weights
]
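A worked example of the sizing above, with hypothetical weights and a hypothetical per-split target. margin=0.0 reproduces the target sizes (sizes_per_dataset_target) and margin=0.5 reproduces the 0.5% buffer (sizes_per_dataset_buffer) that the builder requests:

import math
from typing import List

def size_per_split_per_dataset(
    weights: List[float], targets: List[int], margin: float = 0.0
) -> List[List[int]]:
    # Same formula as above: per-dataset request, padded by `margin` percent.
    return [
        [
            int(math.ceil(math.ceil(target * weight) * (1 + margin / 100)))
            for target in targets
        ]
        for weight in weights
    ]

weights = [0.3, 0.7]  # hypothetical normalized blend weights
targets = [1000]      # hypothetical number of samples requested for one split

print(size_per_split_per_dataset(weights, targets))              # [[300], [700]]
print(size_per_split_per_dataset(weights, targets, margin=0.5))  # [[302], [704]]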
@@ -34,12 +34,6 @@ class BlendedMegatronDatasetConfig:
'blend'. Defauls to None.
"""
renormalize_blend_weights: bool = False
"""Renormalize the blend weights to account for mid-level dataset oversampling done to ensure
fulfillmenet of the of the requested number of samples. Defaults to False for backward
comparability in the data sample order.
"""
split: Optional[str] = None
"""The split string, a comma separated weighting for the dataset splits when drawing samples
from a single distribution. Not to be used with 'blend_per_split'. Defaults to None.
@@ -67,7 +61,7 @@ class BlendedMegatronDatasetConfig:
"""
tokenizer: Optional[MegatronTokenizer] = None
"""The MegatronTokenizer instance or None. Required for datasets which do online tokenization."""
"""The MegatronTokenizer instance. Required for datasets that do online tokenization."""
def __post_init__(self) -> None:
"""Do asserts and set fields post init"""
@@ -149,7 +143,8 @@ def convert_split_vector_to_split_matrix(
Args:
vector_a (List[float]): The primary split vector
vector_b (Optional[List[float]]): An optional secondary split vector which constrains the primary split vector. Defaults to None.
vector_b (Optional[List[float]]): An optional secondary split vector which constrains the
primary split vector. Defaults to None.
Returns:
List[Tuple[float, float]]: The split matrix consisting of book-ends of each split in order
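For intuition, a standalone sketch of the book-end computation for the primary vector only (the optional vector_b constraint is omitted, and this is not the library implementation). The --split 949,50,1 value used in the scripts above normalizes to roughly [0.949, 0.05, 0.001]:

from typing import List, Tuple

def split_vector_to_bookends(vector: List[float]) -> List[Tuple[float, float]]:
    # Cumulative book-ends: split i covers the [start, end) slice of the documents.
    bookends, start = [], 0.0
    for weight in vector:
        bookends.append((start, start + weight))
        start += weight
    return bookends

print(split_vector_to_bookends([0.949, 0.05, 0.001]))
# -> approximately [(0.0, 0.949), (0.949, 0.999), (0.999, 1.0)]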