Commit bc5c7fa7 authored by wxj

First test commit

parent 70fddd0f
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1
# Batch size (global batch size) options = [8, 128].
GBS=8
# Scale the layer count and node count with the pipeline-parallel size.
NLS=$((3*PP))
NNODES=${PP}
# Other params.
TP=8       # Tensor-parallel size.
MBS=1      # Micro-batch size.
HS=20480   # Hidden size.
NAH=128    # Number of attention heads.
DDP=local  # DistributedDataParallel implementation.
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
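A hedged sanity check for the sweep above, not part of the original scripts: since NLS = 3*PP and NNODES = PP with TP = 8, every configuration keeps three transformer layers per pipeline stage and a data-parallel size of one, assuming 8 GPUs per node (GPUS_PER_NODE is an illustrative name, not a variable from the scripts).

# Sketch: the figure-11 sweep holds layers-per-stage and DP constant.
GPUS_PER_NODE = 8   # assumption: 8 GPUs per node, matching TP=8 above
TP = 8

for PP in [1, 2, 4, 8]:
    NLS = 3 * PP                      # layers scale with pipeline size
    NNODES = PP
    world = NNODES * GPUS_PER_NODE
    DP = world // (TP * PP)
    assert NLS % PP == 0
    print(f"PP={PP}: layers/stage={NLS // PP}, world={world}, DP={DP}")
# Every row prints layers/stage=3 and DP=1: a weak-scaling sweep.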
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Interleaved schedule options = [YES, NO].
INTERLEAVED=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set interleaved schedule options.
if [ "${INTERLEAVED}" == "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ "${INTERLEAVED}" == "NO" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
    echo "Invalid INTERLEAVED option: ${INTERLEAVED}"
    exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
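To make the interleaving concrete, here is a small illustrative calculation using the values set above (variable names are for exposition only): with NLS=96 layers over PP=12 pipeline stages, each device holds 8 layers, and --num-layers-per-virtual-pipeline-stage 2 splits those 8 layers into 4 model chunks per device.

# Sketch of the interleaved-schedule arithmetic in this script.
NLS, PP = 96, 12
LAYERS_PER_VIRTUAL_STAGE = 2   # --num-layers-per-virtual-pipeline-stage 2

layers_per_device = NLS // PP                                      # 8
chunks_per_device = layers_per_device // LAYERS_PER_VIRTUAL_STAGE  # 4

assert NLS % (PP * LAYERS_PER_VIRTUAL_STAGE) == 0, "layers must split evenly"
print(layers_per_device, chunks_per_device)  # 8 4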
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 128].
GBS=32
# Set pipeline-parallel and tensor-parallel size options.
TP=$((64/PP))
# Other params.
MBS=1
NLS=32
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
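A hedged check of the sweep above (the loop values come from the comment at the top of the script): TP = 64/PP pins TP*PP at 64 GPUs, matching NNODES=8 at an assumed 8 GPUs per node, so the sweep only moves the boundary between tensor and pipeline parallelism at fixed resources.

# Sketch: figure-13 trades TP against PP on a fixed 64-GPU budget.
GPUS_PER_NODE = 8   # assumption consistent with NNODES=8 above
NNODES = 8

for PP in [2, 4, 8, 16, 32]:
    TP = 64 // PP
    assert TP * PP == NNODES * GPUS_PER_NODE == 64
    print(f"PP={PP:2d} TP={TP:2d}")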
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 512].
GBS=32
# Set pipeline-parallel and data-parallel size options.
DP=$((64/PP))
# Other params.
TP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
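A hedged aside on the batch arithmetic in this sweep (the loop below is illustrative, not from the scripts): with MBS=1, the pipeline schedule runs GBS/(DP*MBS) microbatches per iteration, so shrinking DP by growing PP hands the pipeline more microbatches to overlap.

# Sketch: microbatches per iteration for the figure-14 sweep.
MBS = 1
for PP in [2, 4, 8, 16, 32]:
    DP = 64 // PP
    for GBS in (32, 512):
        assert GBS % (DP * MBS) == 0   # the global batch must split evenly
        n_micro = GBS // (DP * MBS)
        print(f"PP={PP:2d} DP={DP:2d} GBS={GBS:3d} microbatches={n_micro}")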
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Set tensor-parallel and data-parallel size options.
DP=$((64/TP))
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
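As a final hedged check (again assuming 8 GPUs per node, consistent with NNODES=8 above): the three parallelism dimensions multiply to the world size, so the figure-15 sweep with PP=1 simply shifts GPUs between tensor and data parallelism on the same 64 GPUs.

# Sketch: world-size decomposition for the figure-15 sweep.
GPUS_PER_NODE = 8   # assumption consistent with NNODES=8 above
NNODES, PP = 8, 1

for TP in [2, 4, 8, 16, 32]:
    DP = 64 // TP
    world = TP * PP * DP
    assert world == NNODES * GPUS_PER_NODE == 64
    print(f"TP={TP:2d} DP={DP:2d} world={world}")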
Megatron Core is a library for efficient and scalable training of transformer-based models.
import megatron.core.tensor_parallel
import megatron.core.utils
from megatron.core import parallel_state
from megatron.core.distributed import DistributedDataParallel
from megatron.core.inference_params import InferenceParams
from megatron.core.model_parallel_config import ModelParallelConfig
from megatron.core.timers import Timers
# Alias parallel_state as mpu, its legacy name
mpu = parallel_state
__all__ = [
"parallel_state",
"tensor_parallel",
"utils",
"DistributedDataParallel",
"InferenceParams",
"ModelParallelConfig",
"Timers",
]
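A minimal usage sketch, assuming only an installed megatron.core: mpu is bound to the same module object as parallel_state, so legacy imports keep working. Note that mpu is importable by name even though it is absent from __all__, since __all__ only restricts wildcard imports.

# Sketch: parallel_state and its legacy alias refer to one module.
from megatron.core import mpu, parallel_state

assert mpu is parallel_state
# Any attribute reached through one name is reachable through the other,
# e.g. mpu.get_tensor_model_parallel_rank (call it only after
# distributed/model-parallel initialization).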