更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79311, "5": 10.83074, "10": 10.76725, "15": 10.82664, "20": 10.81793, "25": 10.76529, "30": 10.69182, "35": 10.61672, "40": 10.44907, "45": 10.21488, "50": 10.21715, "55": 10.14491, "60": 9.76806, "65": 9.20573, "70": 9.87752, "75": 9.55094, "80": 9.52283, "85": 9.7106, "90": 9.89179, "95": 9.59202, "100": 9.48543}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 586369024.0, "5": 587417600.0, "10": 587417600.0, "15": 587417600.0, "20": 869128704.0, "25": 867031552.0, "30": 867031552.0, "35": 867031552.0, "40": 867031552.0, "45": 867031552.0, "50": 869128704.0, "55": 867031552.0, "60": 867031552.0, "65": 867031552.0, "70": 867031552.0, "75": 867031552.0, "80": 869128704.0, "85": 867031552.0, "90": 867031552.0, "95": 867031552.0, "100": 867031552.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3832784384.0, "5": 3832784896.0, "10": 3832784896.0, "15": 3832784896.0, "20": 4114758144.0, "25": 4114758144.0, "30": 4114758144.0, "35": 4114758144.0, "40": 4114758144.0, "45": 4114758144.0, "50": 4114758144.0, "55": 4114758144.0, "60": 4114758144.0, "65": 4114758144.0, "70": 4114758144.0, "75": 4114758144.0, "80": 4114758144.0, "85": 4114758144.0, "90": 4114758144.0, "95": 4114758144.0, "100": 4114758144.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 15.36019, "5": 0.14748, "10": 0.14569, "15": 0.14722, "20": 0.15678, "25": 0.15572, "30": 0.15085, "35": 0.15125, "40": 0.15141, "45": 0.15202, "50": 0.14925, "55": 0.14768, "60": 0.14952, "65": 0.15001, "70": 0.15024, "75": 0.14973, "80": 0.14933, "85": 0.1492, "90": 0.14942, "95": 0.14927, "100": 0.14832}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1846.0, "25": 2348.0, "30": 2490.0, "35": 2010.0, "40": 2016.0, "45": 2642.0, "50": 2810.0, "55": 2481.0, "60": 2945.0, "65": 2329.0, "70": 3673.0, "75": 3016.0, "80": 3642.0, "85": 4122.0, "90": 3744.0, "95": 4035.0, "100": 3447.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79311, "5": 10.83074, "10": 10.76725, "15": 10.82664, "20": 10.81793, "25": 10.76529, "30": 10.69182, "35": 10.61672, "40": 10.44907, "45": 10.21488, "50": 10.21715, "55": 10.14491, "60": 9.76806, "65": 9.20573, "70": 9.87752, "75": 9.55094, "80": 9.52283, "85": 9.7106, "90": 9.89179, "95": 9.59202, "100": 9.48543}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 586369024.0, "5": 587417600.0, "10": 587417600.0, "15": 587417600.0, "20": 869128704.0, "25": 869128704.0, "30": 869128704.0, "35": 869128704.0, "40": 869128704.0, "45": 869128704.0, "50": 869128704.0, "55": 869128704.0, "60": 869128704.0, "65": 869128704.0, "70": 869128704.0, "75": 869128704.0, "80": 869128704.0, "85": 869128704.0, "90": 869128704.0, "95": 869128704.0, "100": 869128704.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3832784384.0, "5": 3832784896.0, "10": 3832784896.0, "15": 3832784896.0, "20": 4114758144.0, "25": 4114758144.0, "30": 4114758144.0, "35": 4114758144.0, "40": 4114758144.0, "45": 4114758144.0, "50": 4114758144.0, "55": 4114758144.0, "60": 4114758144.0, "65": 4114758144.0, "70": 4114758144.0, "75": 4114758144.0, "80": 4114758144.0, "85": 4114758144.0, "90": 4114758144.0, "95": 4114758144.0, "100": 4114758144.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 17.6616, "5": 0.15013, "10": 0.15207, "15": 0.15146, "20": 0.15882, "25": 0.15823, "30": 0.15777, "35": 0.15885, "40": 0.15922, "45": 0.15588, "50": 0.15635, "55": 0.15588, "60": 0.15681, "65": 0.15688, "70": 0.15648, "75": 0.15793, "80": 0.15889, "85": 0.15769, "90": 0.15693, "95": 0.15611, "100": 0.15689}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1846.0, "25": 2348.0, "30": 2490.0, "35": 2010.0, "40": 2016.0, "45": 2642.0, "50": 2810.0, "55": 2481.0, "60": 2945.0, "65": 2329.0, "70": 3673.0, "75": 3016.0, "80": 3642.0, "85": 4122.0, "90": 3744.0, "95": 4035.0, "100": 3447.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -48,4 +48,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112}
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.85929, "5": 10.87929, "10": 10.84772, "15": 10.86867, "20": 10.87317, "25": 10.83338, "30": 10.75624, "35": 10.66844, "40": 10.50171, "45": 10.28002, "50": 10.25621}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 379427840.0, "5": 378379264.0, "10": 378903552.0, "15": 378379264.0, "20": 561597952.0, "25": 561073664.0, "30": 561597952.0, "35": 561597952.0, "40": 561597952.0, "45": 561597952.0, "50": 561597952.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1704025600.0, "5": 1704026112.0, "10": 1704026112.0, "15": 1704026112.0, "20": 1886196224.0, "25": 1886196224.0, "30": 1886196224.0, "35": 1886196224.0, "40": 1886196224.0, "45": 1886196224.0, "50": 1886196224.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.22765, "5": 0.19439, "10": 0.19327, "15": 0.19227, "20": 0.20227, "25": 0.20323, "30": 0.2014, "35": 0.20216, "40": 0.20166, "45": 0.20072, "50": 0.19941}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1808.0, "25": 2385.0, "30": 2591.0, "35": 1997.0, "40": 1959.0, "45": 2368.0, "50": 3073.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0]}, "iteration_timing_avg": 0.14292588235294112}
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.85929, "5": 10.87929, "10": 10.84772, "15": 10.86867, "20": 10.87317, "25": 10.83338, "30": 10.75624, "35": 10.66844, "40": 10.50171, "45": 10.28002, "50": 10.25621}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 380476416.0, "5": 380476416.0, "10": 378903552.0, "15": 380476416.0, "20": 560549376.0, "25": 560549376.0, "30": 560549376.0, "35": 560549376.0, "40": 560287232.0, "45": 560549376.0, "50": 560549376.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1704025600.0, "5": 1704026112.0, "10": 1704026112.0, "15": 1704026112.0, "20": 1884099072.0, "25": 1884099072.0, "30": 1884099072.0, "35": 1884099072.0, "40": 1884361216.0, "45": 1884361216.0, "50": 1884361216.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.59076, "5": 0.20078, "10": 0.20046, "15": 0.19967, "20": 0.20892, "25": 0.20876, "30": 0.2082, "35": 0.2082, "40": 0.21131, "45": 0.21272, "50": 0.21012}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1808.0, "25": 2385.0, "30": 2591.0, "35": 1997.0, "40": 1959.0, "45": 2368.0, "50": 3073.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -46,4 +46,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_dev.json
+{}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/golden_values_lts.json
+{}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume/model_config.yaml
@@ -34,8 +34,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -63,4 +63,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/golden_values_0.10.0.json
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/golden_values_0.10.0.json
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/golden_values_0.9.0.json
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x22b_tp2pp8ep8vpp1_release/golden_values_0.9.0.json
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.10.0.json
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.10.0.json
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.8.0.json
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.8.0.json
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.9.0.json
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/golden_values_0.9.0.json
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release/model_config.yaml
@@ -76,8 +76,8 @@ MODEL_ARGS:
  --eval-iters: 32
  --eval-interval: 200
  # Add checkpointing args
-  --load: ${OUTPUT_PATH}/checkpoints
-  --save: ${OUTPUT_PATH}/checkpoints
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --save-interval: 5000
  # Add initialization args
  --init-method-std: 0.010

--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_alltoall_tp2pp4ep4_release_sm/model_config.yaml
@@ -76,8 +76,8 @@ MODEL_ARGS:
  --eval-iters: 32
  --eval-interval: 200
  # Add checkpointing args
-  --load: ${OUTPUT_PATH}/checkpoints
-  --save: ${OUTPUT_PATH}/checkpoints
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --save-interval: 500
  # Add initialization args
  --init-method-std: 0.010

--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/golden_values_0.10.0.json
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/golden_values_0.10.0.json
--- a/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/golden_values_0.9.0.json
+++ b/tests/functional_tests/test_cases/mixtral/mixtral_8x7b_tp1pp4ep8vpp8_release/golden_values_0.9.0.json