更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -48,4 +48,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92655, "5": 10.92715, "10": 10.90788, "15": 10.88296, "20": 10.77598, "25": 10.59263, "30": 10.39177, "35": 10.297, "40": 10.09664, "45": 9.84468, "50": 9.90938, "55": 9.87767, "60": 9.4912, "65": 8.94239, "70": 9.72271, "75": 9.41883, "80": 9.40054, "85": 9.61183, "90": 9.81021, "95": 9.51721, "100": 9.40125}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 61.0, "5": 67.0, "10": 45.0, "15": 63.0, "20": 62.0, "25": 59.0, "30": 62.0, "35": 73.0, "40": 68.0, "45": 80.0, "50": 96.0, "55": 51.0, "60": 83.0, "65": 93.0, "70": 91.0, "75": 76.0, "80": 78.0, "85": 78.0, "90": 88.0, "95": 82.0, "100": 90.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 487096832.0, "5": 487096832.0, "10": 487096832.0, "15": 487096832.0, "20": 487096832.0, "25": 487096832.0, "30": 487096832.0, "35": 487096832.0, "40": 487096832.0, "45": 487096832.0, "50": 487096832.0, "55": 487096832.0, "60": 487096832.0, "65": 487096832.0, "70": 487096832.0, "75": 487096832.0, "80": 487096832.0, "85": 487096832.0, "90": 487096832.0, "95": 487096832.0, "100": 487096832.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1668052480.0, "5": 1848125952.0, "10": 1848125952.0, "15": 1848125952.0, "20": 1848125952.0, "25": 1848125952.0, "30": 1848125952.0, "35": 1848125952.0, "40": 1848125952.0, "45": 1848125952.0, "50": 1848125952.0, "55": 1848125952.0, "60": 1848125952.0, "65": 1848125952.0, "70": 1848125952.0, "75": 1848125952.0, "80": 1848125952.0, "85": 1848125952.0, "90": 1848125952.0, "95": 1848125952.0, "100": 1848125952.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8138, "5": 0.19926, "10": 0.19439, "15": 0.19389, "20": 0.19552, "25": 0.19186, "30": 0.19341, "35": 0.19268, "40": 0.19289, "45": 0.19218, "50": 0.19214, "55": 0.19236, "60": 0.19561, "65": 0.19299, "70": 0.19296, "75": 0.19308, "80": 0.19336, "85": 0.19452, "90": 0.19164, "95": 0.19304, "100": 0.19217}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92799, "10": 10.90789, "15": 10.88313, "20": 10.77626, "25": 10.59138, "30": 10.39195, "35": 10.29687, "40": 10.0964, "45": 9.84466, "50": 9.90919, "55": 9.87765, "60": 9.49125, "65": 8.94236, "70": 9.72262, "75": 9.4191, "80": 9.40075, "85": 9.61211, "90": 9.81017, "95": 9.51717, "100": 9.40147}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 68.0, "5": 64.0, "10": 61.0, "15": 58.0, "20": 64.0, "25": 58.0, "30": 85.0, "35": 66.0, "40": 85.0, "45": 82.0, "50": 68.0, "55": 84.0, "60": 71.0, "65": 85.0, "70": 92.0, "75": 62.0, "80": 87.0, "85": 74.0, "90": 71.0, "95": 79.0, "100": 72.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 487096320.0, "5": 487096320.0, "10": 487096320.0, "15": 487096320.0, "20": 487096320.0, "25": 487096320.0, "30": 487096320.0, "35": 487096320.0, "40": 487096320.0, "45": 487096320.0, "50": 487096320.0, "55": 487096320.0, "60": 487096320.0, "65": 487096320.0, "70": 487096320.0, "75": 487096320.0, "80": 487096320.0, "85": 487096320.0, "90": 487096320.0, "95": 487096320.0, "100": 487096320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2158389248.0, "5": 2338462720.0, "10": 2338462720.0, "15": 2338462720.0, "20": 2338462720.0, "25": 2338462720.0, "30": 2338462720.0, "35": 2338462720.0, "40": 2338462720.0, "45": 2338462720.0, "50": 2338462720.0, "55": 2338462720.0, "60": 2338462720.0, "65": 2338462720.0, "70": 2338462720.0, "75": 2338462720.0, "80": 2338462720.0, "85": 2338462720.0, "90": 2338462720.0, "95": 2338462720.0, "100": 2338462720.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.88233, "5": 0.22608, "10": 0.21553, "15": 0.21336, "20": 0.21247, "25": 0.21243, "30": 0.23729, "35": 0.2257, "40": 0.21253, "45": 0.21718, "50": 0.21345, "55": 0.21376, "60": 0.21327, "65": 0.21242, "70": 0.21194, "75": 0.21274, "80": 0.21252, "85": 0.21061, "90": 0.21024, "95": 0.21239, "100": 0.21117}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -48,4 +48,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92655, "5": 10.9272, "10": 10.90786, "15": 10.88292, "20": 10.77603, "25": 10.59271, "30": 10.39175, "35": 10.297, "40": 10.09664, "45": 9.84468, "50": 9.9094, "55": 9.87765, "60": 9.49117, "65": 8.94241, "70": 9.72269, "75": 9.41888, "80": 9.40055, "85": 9.61184, "90": 9.81022, "95": 9.51724, "100": 9.4013}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1637.0, "5": 1988.0, "10": 1422.0, "15": 1936.0, "20": 1566.0, "25": 1705.0, "30": 1974.0, "35": 2043.0, "40": 2249.0, "45": 2145.0, "50": 2454.0, "55": 2388.0, "60": 2479.0, "65": 2674.0, "70": 3241.0, "75": 2687.0, "80": 3465.0, "85": 3382.0, "90": 3023.0, "95": 3415.0, "100": 3347.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 487096832.0, "5": 487096832.0, "10": 487096832.0, "15": 487096832.0, "20": 487096832.0, "25": 487096832.0, "30": 487096832.0, "35": 487096832.0, "40": 487096832.0, "45": 487096832.0, "50": 487096832.0, "55": 487096832.0, "60": 487096832.0, "65": 487096832.0, "70": 487096832.0, "75": 487096832.0, "80": 487096832.0, "85": 487096832.0, "90": 487096832.0, "95": 487096832.0, "100": 487096832.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1229747712.0, "5": 1409821184.0, "10": 1409821184.0, "15": 1409821184.0, "20": 1409821184.0, "25": 1409821184.0, "30": 1409821184.0, "35": 1409821184.0, "40": 1409821184.0, "45": 1409821184.0, "50": 1409821184.0, "55": 1409821184.0, "60": 1409821184.0, "65": 1409821184.0, "70": 1409821184.0, "75": 1409821184.0, "80": 1409821184.0, "85": 1409821184.0, "90": 1409821184.0, "95": 1409821184.0, "100": 1409821184.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.17732, "5": 0.20296, "10": 0.20325, "15": 0.20174, "20": 0.20216, "25": 0.20151, "30": 0.20223, "35": 0.20172, "40": 0.20152, "45": 0.20108, "50": 0.20046, "55": 0.1934, "60": 0.19326, "65": 0.19362, "70": 0.19278, "75": 0.19295, "80": 0.19307, "85": 0.19325, "90": 0.19304, "95": 0.19317, "100": 0.19328}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92795, "10": 10.90786, "15": 10.88314, "20": 10.77629, "25": 10.5914, "30": 10.39194, "35": 10.29685, "40": 10.09639, "45": 9.84463, "50": 9.90918, "55": 9.87766, "60": 9.49126, "65": 8.94236, "70": 9.72266, "75": 9.41909, "80": 9.40076, "85": 9.61209, "90": 9.81018, "95": 9.51718, "100": 9.40151}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1627.0, "5": 2010.0, "10": 1368.0, "15": 1897.0, "20": 1626.0, "25": 1769.0, "30": 1899.0, "35": 1988.0, "40": 2199.0, "45": 2158.0, "50": 2494.0, "55": 2485.0, "60": 2351.0, "65": 2777.0, "70": 3197.0, "75": 2615.0, "80": 3395.0, "85": 3340.0, "90": 3060.0, "95": 3408.0, "100": 3242.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 487096320.0, "5": 487096320.0, "10": 487096320.0, "15": 487096320.0, "20": 487096320.0, "25": 487096320.0, "30": 487096320.0, "35": 487096320.0, "40": 487096320.0, "45": 487096320.0, "50": 487096320.0, "55": 487096320.0, "60": 487096320.0, "65": 487096320.0, "70": 487096320.0, "75": 487096320.0, "80": 487096320.0, "85": 487096320.0, "90": 487096320.0, "95": 487096320.0, "100": 487096320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1720084480.0, "5": 1900157952.0, "10": 1900157952.0, "15": 1900157952.0, "20": 1900157952.0, "25": 1900157952.0, "30": 1900157952.0, "35": 1900157952.0, "40": 1900157952.0, "45": 1900157952.0, "50": 1900157952.0, "55": 1900157952.0, "60": 1900157952.0, "65": 1900157952.0, "70": 1900157952.0, "75": 1900157952.0, "80": 1900157952.0, "85": 1900157952.0, "90": 1900157952.0, "95": 1900157952.0, "100": 1900157952.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.97156, "5": 0.2168, "10": 0.21367, "15": 0.22327, "20": 0.20978, "25": 0.20953, "30": 0.21033, "35": 0.20882, "40": 0.21062, "45": 0.20902, "50": 0.20932, "55": 0.21153, "60": 0.20966, "65": 0.20901, "70": 0.20892, "75": 0.21183, "80": 0.21189, "85": 0.21367, "90": 0.21386, "95": 0.21529, "100": 0.21247}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92655, "5": 10.9272, "10": 10.90786, "15": 10.88292, "20": 10.77603, "25": 10.59271, "30": 10.39175, "35": 10.297, "40": 10.09664, "45": 9.84468, "50": 9.9094, "55": 9.87765, "60": 9.49117, "65": 8.94241, "70": 9.72269, "75": 9.41888, "80": 9.40055, "85": 9.61184, "90": 9.81022, "95": 9.51724, "100": 9.4013}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1637.0, "5": 1988.0, "10": 1422.0, "15": 1936.0, "20": 1566.0, "25": 1705.0, "30": 1974.0, "35": 2043.0, "40": 2249.0, "45": 2145.0, "50": 2454.0, "55": 2388.0, "60": 2479.0, "65": 2674.0, "70": 3241.0, "75": 2687.0, "80": 3465.0, "85": 3382.0, "90": 3023.0, "95": 3415.0, "100": 3347.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 487096832.0, "5": 487096832.0, "10": 487096832.0, "15": 487096832.0, "20": 487096832.0, "25": 487096832.0, "30": 487096832.0, "35": 487096832.0, "40": 487096832.0, "45": 487096832.0, "50": 487096832.0, "55": 487096832.0, "60": 487096832.0, "65": 487096832.0, "70": 487096832.0, "75": 487096832.0, "80": 487096832.0, "85": 487096832.0, "90": 487096832.0, "95": 487096832.0, "100": 487096832.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1225553408.0, "5": 1405626880.0, "10": 1405626880.0, "15": 1405626880.0, "20": 1405626880.0, "25": 1405626880.0, "30": 1405626880.0, "35": 1405626880.0, "40": 1405626880.0, "45": 1405626880.0, "50": 1405626880.0, "55": 1405626880.0, "60": 1405626880.0, "65": 1405626880.0, "70": 1405626880.0, "75": 1405626880.0, "80": 1405626880.0, "85": 1405626880.0, "90": 1405626880.0, "95": 1405626880.0, "100": 1405626880.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.43309, "5": 0.19435, "10": 0.19438, "15": 0.19481, "20": 0.19447, "25": 0.19273, "30": 0.19383, "35": 0.19374, "40": 0.19351, "45": 0.19317, "50": 0.19324, "55": 0.19031, "60": 0.19029, "65": 0.1911, "70": 0.19168, "75": 0.19169, "80": 0.1923, "85": 0.19181, "90": 0.19164, "95": 0.19197, "100": 0.19113}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92795, "10": 10.90786, "15": 10.88314, "20": 10.77629, "25": 10.5914, "30": 10.39194, "35": 10.29685, "40": 10.09639, "45": 9.84463, "50": 9.90918, "55": 9.87766, "60": 9.49126, "65": 8.94236, "70": 9.72266, "75": 9.41909, "80": 9.40076, "85": 9.61209, "90": 9.81018, "95": 9.51718, "100": 9.40151}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1627.0, "5": 2010.0, "10": 1368.0, "15": 1897.0, "20": 1626.0, "25": 1769.0, "30": 1899.0, "35": 1988.0, "40": 2199.0, "45": 2158.0, "50": 2494.0, "55": 2485.0, "60": 2351.0, "65": 2777.0, "70": 3197.0, "75": 2615.0, "80": 3395.0, "85": 3340.0, "90": 3060.0, "95": 3408.0, "100": 3242.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 487096320.0, "5": 487096320.0, "10": 487096320.0, "15": 487096320.0, "20": 487096320.0, "25": 487096320.0, "30": 487096320.0, "35": 487096320.0, "40": 487096320.0, "45": 487096320.0, "50": 487096320.0, "55": 487096320.0, "60": 487096320.0, "65": 487096320.0, "70": 487096320.0, "75": 487096320.0, "80": 487096320.0, "85": 487096320.0, "90": 487096320.0, "95": 487096320.0, "100": 487096320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1715890176.0, "5": 1895963648.0, "10": 1895963648.0, "15": 1895963648.0, "20": 1895963648.0, "25": 1895963648.0, "30": 1895963648.0, "35": 1895963648.0, "40": 1895963648.0, "45": 1895963648.0, "50": 1895963648.0, "55": 1895963648.0, "60": 1895963648.0, "65": 1895963648.0, "70": 1895963648.0, "75": 1895963648.0, "80": 1895963648.0, "85": 1895963648.0, "90": 1895963648.0, "95": 1895963648.0, "100": 1895963648.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.12901, "5": 0.21538, "10": 0.21548, "15": 0.2128, "20": 0.21291, "25": 0.21127, "30": 0.21513, "35": 0.21158, "40": 0.213, "45": 0.21093, "50": 0.2091, "55": 0.20696, "60": 0.21221, "65": 0.20519, "70": 0.2076, "75": 0.20862, "80": 0.20653, "85": 0.20713, "90": 0.20604, "95": 0.21111, "100": 0.20922}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -48,4 +48,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92655, "5": 10.9272, "10": 10.90786, "15": 10.88292, "20": 10.77603, "25": 10.59271, "30": 10.39175, "35": 10.297, "40": 10.09664, "45": 9.84468, "50": 9.9094, "55": 9.87765, "60": 9.49117, "65": 8.94241, "70": 9.72269, "75": 9.41888, "80": 9.40055, "85": 9.61184, "90": 9.81022, "95": 9.51724, "100": 9.4013}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1637.0, "5": 1988.0, "10": 1422.0, "15": 1936.0, "20": 1566.0, "25": 1705.0, "30": 1974.0, "35": 2043.0, "40": 2249.0, "45": 2145.0, "50": 2454.0, "55": 2388.0, "60": 2479.0, "65": 2674.0, "70": 3241.0, "75": 2687.0, "80": 3465.0, "85": 3382.0, "90": 3023.0, "95": 3415.0, "100": 3347.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 490242560.0, "5": 490242560.0, "10": 490242560.0, "15": 490242560.0, "20": 490242560.0, "25": 490242560.0, "30": 490242560.0, "35": 490242560.0, "40": 490242560.0, "45": 490242560.0, "50": 490242560.0, "55": 490242560.0, "60": 490242560.0, "65": 490242560.0, "70": 490242560.0, "75": 490242560.0, "80": 490242560.0, "85": 490242560.0, "90": 490242560.0, "95": 490242560.0, "100": 490242560.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1228699136.0, "5": 1414015488.0, "10": 1414015488.0, "15": 1414015488.0, "20": 1414015488.0, "25": 1414015488.0, "30": 1414015488.0, "35": 1414015488.0, "40": 1414015488.0, "45": 1414015488.0, "50": 1414015488.0, "55": 1414015488.0, "60": 1414015488.0, "65": 1414015488.0, "70": 1414015488.0, "75": 1414015488.0, "80": 1414015488.0, "85": 1414015488.0, "90": 1414015488.0, "95": 1414015488.0, "100": 1414015488.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.55848, "5": 0.19452, "10": 0.1941, "15": 0.19706, "20": 0.19456, "25": 0.19225, "30": 0.19466, "35": 0.19187, "40": 0.19248, "45": 0.1906, "50": 0.19117, "55": 0.20393, "60": 0.20447, "65": 0.20474, "70": 0.20347, "75": 0.20347, "80": 0.20417, "85": 0.2045, "90": 0.20333, "95": 0.20388, "100": 0.20321}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92705, "5": 10.92795, "10": 10.90786, "15": 10.88314, "20": 10.77629, "25": 10.5914, "30": 10.39194, "35": 10.29685, "40": 10.09639, "45": 9.84463, "50": 9.90918, "55": 9.87766, "60": 9.49126, "65": 8.94236, "70": 9.72266, "75": 9.41909, "80": 9.40076, "85": 9.61209, "90": 9.81018, "95": 9.51718, "100": 9.40151}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1627.0, "5": 2010.0, "10": 1368.0, "15": 1897.0, "20": 1626.0, "25": 1769.0, "30": 1899.0, "35": 1988.0, "40": 2199.0, "45": 2158.0, "50": 2494.0, "55": 2485.0, "60": 2351.0, "65": 2777.0, "70": 3197.0, "75": 2615.0, "80": 3395.0, "85": 3340.0, "90": 3060.0, "95": 3408.0, "100": 3242.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 487096320.0, "5": 487096320.0, "10": 487096320.0, "15": 487096320.0, "20": 487096320.0, "25": 487096320.0, "30": 487096320.0, "35": 487096320.0, "40": 487096320.0, "45": 487096320.0, "50": 487096320.0, "55": 487096320.0, "60": 487096320.0, "65": 487096320.0, "70": 487096320.0, "75": 487096320.0, "80": 487096320.0, "85": 487096320.0, "90": 487096320.0, "95": 487096320.0, "100": 487096320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1720084480.0, "5": 1900157952.0, "10": 1900157952.0, "15": 1900157952.0, "20": 1900157952.0, "25": 1900157952.0, "30": 1900157952.0, "35": 1900157952.0, "40": 1900157952.0, "45": 1900157952.0, "50": 1900157952.0, "55": 1900157952.0, "60": 1900157952.0, "65": 1900157952.0, "70": 1900157952.0, "75": 1900157952.0, "80": 1900157952.0, "85": 1900157952.0, "90": 1900157952.0, "95": 1900157952.0, "100": 1900157952.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.22421, "5": 0.2135, "10": 0.21228, "15": 0.21124, "20": 0.21112, "25": 0.21341, "30": 0.21004, "35": 0.21039, "40": 0.21245, "45": 0.21157, "50": 0.21206, "55": 0.21309, "60": 0.21493, "65": 0.2203, "70": 0.21919, "75": 0.2139, "80": 0.21624, "85": 0.21803, "90": 0.21757, "95": 0.21527, "100": 0.21237}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -48,4 +48,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92655, "5": 10.92718, "10": 10.90795, "15": 10.88296, "20": 10.77593, "25": 10.59272, "30": 10.39174, "35": 10.29697, "40": 10.09661, "45": 9.84472, "50": 9.90947, "55": 9.87772, "60": 9.49122, "65": 8.94261, "70": 9.72277, "75": 9.41891, "80": 9.40056, "85": 9.61186, "90": 9.81027, "95": 9.51723, "100": 9.40137}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1611.0, "5": 1973.0, "10": 1470.0, "15": 1891.0, "20": 1584.0, "25": 1645.0, "30": 1962.0, "35": 1981.0, "40": 2112.0, "45": 2100.0, "50": 2531.0, "55": 2378.0, "60": 2386.0, "65": 2711.0, "70": 3230.0, "75": 2725.0, "80": 3457.0, "85": 3332.0, "90": 3085.0, "95": 3461.0, "100": 3332.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 438469120.0, "5": 438469120.0, "10": 438469120.0, "15": 438469120.0, "20": 438469120.0, "25": 438469120.0, "30": 438469120.0, "35": 438469120.0, "40": 438469120.0, "45": 438469120.0, "50": 438469120.0, "55": 438469120.0, "60": 438469120.0, "65": 438469120.0, "70": 438469120.0, "75": 438469120.0, "80": 438469120.0, "85": 438469120.0, "90": 438469120.0, "95": 438469120.0, "100": 438469120.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1179678208.0, "5": 1361717760.0, "10": 1361717760.0, "15": 1361717760.0, "20": 1361717760.0, "25": 1361717760.0, "30": 1361717760.0, "35": 1361717760.0, "40": 1361717760.0, "45": 1361717760.0, "50": 1361717760.0, "55": 1361717760.0, "60": 1361717760.0, "65": 1361717760.0, "70": 1361717760.0, "75": 1361717760.0, "80": 1361717760.0, "85": 1361717760.0, "90": 1361717760.0, "95": 1361717760.0, "100": 1361717760.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.67908, "5": 0.18507, "10": 0.18222, "15": 0.18185, "20": 0.181, "25": 0.18035, "30": 0.18093, "35": 0.18016, "40": 0.17965, "45": 0.17953, "50": 0.17971, "55": 0.17583, "60": 0.1751, "65": 0.17527, "70": 0.17444, "75": 0.17517, "80": 0.17438, "85": 0.17443, "90": 0.17435, "95": 0.17419, "100": 0.17558}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92655, "5": 10.92717, "10": 10.90792, "15": 10.88291, "20": 10.77595, "25": 10.59266, "30": 10.39176, "35": 10.29699, "40": 10.09666, "45": 9.84474, "50": 9.90944, "55": 9.87774, "60": 9.49116, "65": 8.94259, "70": 9.72275, "75": 9.4189, "80": 9.40056, "85": 9.61183, "90": 9.81023, "95": 9.51721, "100": 9.4013}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1587.0, "5": 1991.0, "10": 1408.0, "15": 1899.0, "20": 1647.0, "25": 1674.0, "30": 1912.0, "35": 1972.0, "40": 2247.0, "45": 2075.0, "50": 2469.0, "55": 2421.0, "60": 2487.0, "65": 2765.0, "70": 3291.0, "75": 2709.0, "80": 3493.0, "85": 3365.0, "90": 3095.0, "95": 3435.0, "100": 3327.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 435847168.0, "5": 435847168.0, "10": 435847168.0, "15": 435847168.0, "20": 435847168.0, "25": 436895744.0, "30": 435847168.0, "35": 435847168.0, "40": 435847168.0, "45": 435847168.0, "50": 435847168.0, "55": 435847168.0, "60": 435847168.0, "65": 435847168.0, "70": 435847168.0, "75": 435847168.0, "80": 435847168.0, "85": 435847168.0, "90": 435847168.0, "95": 435847168.0, "100": 435847168.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1179682816.0, "5": 1359626240.0, "10": 1359626240.0, "15": 1359626240.0, "20": 1359626240.0, "25": 1359626240.0, "30": 1359626240.0, "35": 1359626240.0, "40": 1359626240.0, "45": 1359626240.0, "50": 1359626240.0, "55": 1359626240.0, "60": 1359626240.0, "65": 1359626240.0, "70": 1359626240.0, "75": 1359626240.0, "80": 1359626240.0, "85": 1359626240.0, "90": 1359626240.0, "95": 1359626240.0, "100": 1359626240.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.04316, "5": 0.1807, "10": 0.17867, "15": 0.17689, "20": 0.17644, "25": 0.17764, "30": 0.17742, "35": 0.1794, "40": 0.17805, "45": 0.17812, "50": 0.18362, "55": 0.17265, "60": 0.17303, "65": 0.17109, "70": 0.17167, "75": 0.17216, "80": 0.17147, "85": 0.17705, "90": 0.17916, "95": 0.17291, "100": 0.17146}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G/model_config.yaml
@@ -17,8 +17,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -44,4 +44,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 10.88789,
+            "5": 10.90966,
+            "10": 10.87793,
+            "15": 10.86382,
+            "20": 10.75082,
+            "25": 10.5988,
+            "30": 10.40099,
+            "35": 10.30785,
+            "40": 10.10955,
+            "45": 9.85867,
+            "50": 9.92084,
+            "55": 9.88535,
+            "60": 9.50758,
+            "65": 8.95821,
+            "70": 9.72738,
+            "75": 9.42579,
+            "80": 9.40535,
+            "85": 9.61537,
+            "90": 9.81263,
+            "95": 9.52135,
+            "100": 9.40103
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 1742.0,
+            "5": 2115.0,
+            "10": 1468.0,
+            "15": 1877.0,
+            "20": 1665.0,
+            "25": 1643.0,
+            "30": 1900.0,
+            "35": 2086.0,
+            "40": 2185.0,
+            "45": 2254.0,
+            "50": 2496.0,
+            "55": 2418.0,
+            "60": 2489.0,
+            "65": 2697.0,
+            "70": 3267.0,
+            "75": 2631.0,
+            "80": 3442.0,
+            "85": 3440.0,
+            "90": 3075.0,
+            "95": 3348.0,
+            "100": 3389.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 246437376.0,
+            "5": 246437376.0,
+            "10": 246437376.0,
+            "15": 246437376.0,
+            "20": 246437376.0,
+            "25": 246437376.0,
+            "30": 246437376.0,
+            "35": 246437376.0,
+            "40": 246437376.0,
+            "45": 246437376.0,
+            "50": 246437376.0,
+            "55": 246437376.0,
+            "60": 246437376.0,
+            "65": 246437376.0,
+            "70": 246437376.0,
+            "75": 246437376.0,
+            "80": 246437376.0,
+            "85": 246437376.0,
+            "90": 246437376.0,
+            "95": 246437376.0,
+            "100": 246437376.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 1570924032.0,
+            "5": 1634534400.0,
+            "10": 1634534400.0,
+            "15": 1634534400.0,
+            "20": 1634534400.0,
+            "25": 1634534400.0,
+            "30": 1634593280.0,
+            "35": 1634593280.0,
+            "40": 1634593280.0,
+            "45": 1634593280.0,
+            "50": 1634593280.0,
+            "55": 1634593280.0,
+            "60": 1634593280.0,
+            "65": 1634593280.0,
+            "70": 1634593280.0,
+            "75": 1634593280.0,
+            "80": 1634593280.0,
+            "85": 1634593280.0,
+            "90": 1634593280.0,
+            "95": 1634593280.0,
+            "100": 1634593280.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 6.22721,
+            "5": 0.27333,
+            "10": 0.27017,
+            "15": 0.26846,
+            "20": 0.26818,
+            "25": 0.26614,
+            "30": 0.26524,
+            "35": 0.30697,
+            "40": 0.2925,
+            "45": 0.26534,
+            "50": 0.26504,
+            "55": 0.26684,
+            "60": 0.26501,
+            "65": 0.26543,
+            "70": 0.26612,
+            "75": 0.26476,
+            "80": 0.26501,
+            "85": 0.26505,
+            "90": 0.26596,
+            "95": 0.26599,
+            "100": 0.2641
+        }
+    }
+}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 10.88789,
+            "5": 10.90966,
+            "10": 10.87793,
+            "15": 10.86382,
+            "20": 10.75082,
+            "25": 10.5988,
+            "30": 10.40099,
+            "35": 10.30785,
+            "40": 10.10955,
+            "45": 9.85867,
+            "50": 9.92084,
+            "55": 9.88535,
+            "60": 9.50758,
+            "65": 8.95821,
+            "70": 9.72738,
+            "75": 9.42579,
+            "80": 9.40535,
+            "85": 9.61537,
+            "90": 9.81263,
+            "95": 9.52135,
+            "100": 9.40103
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 1742.0,
+            "5": 2115.0,
+            "10": 1468.0,
+            "15": 1877.0,
+            "20": 1665.0,
+            "25": 1643.0,
+            "30": 1900.0,
+            "35": 2086.0,
+            "40": 2185.0,
+            "45": 2254.0,
+            "50": 2496.0,
+            "55": 2418.0,
+            "60": 2489.0,
+            "65": 2697.0,
+            "70": 3267.0,
+            "75": 2631.0,
+            "80": 3442.0,
+            "85": 3440.0,
+            "90": 3075.0,
+            "95": 3348.0,
+            "100": 3389.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 246437376.0,
+            "5": 246437376.0,
+            "10": 246437376.0,
+            "15": 246437376.0,
+            "20": 246437376.0,
+            "25": 246437376.0,
+            "30": 246437376.0,
+            "35": 246437376.0,
+            "40": 246437376.0,
+            "45": 246437376.0,
+            "50": 246437376.0,
+            "55": 246437376.0,
+            "60": 246437376.0,
+            "65": 246437376.0,
+            "70": 246437376.0,
+            "75": 246437376.0,
+            "80": 246437376.0,
+            "85": 246437376.0,
+            "90": 246437376.0,
+            "95": 246437376.0,
+            "100": 246437376.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 1570924032.0,
+            "5": 1634534400.0,
+            "10": 1634534400.0,
+            "15": 1634534400.0,
+            "20": 1634534400.0,
+            "25": 1634534400.0,
+            "30": 1634593280.0,
+            "35": 1634593280.0,
+            "40": 1634593280.0,
+            "45": 1634593280.0,
+            "50": 1634593280.0,
+            "55": 1634593280.0,
+            "60": 1634593280.0,
+            "65": 1634593280.0,
+            "70": 1634593280.0,
+            "75": 1634593280.0,
+            "80": 1634593280.0,
+            "85": 1634593280.0,
+            "90": 1634593280.0,
+            "95": 1634593280.0,
+            "100": 1634593280.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 6.22721,
+            "5": 0.27333,
+            "10": 0.27017,
+            "15": 0.26846,
+            "20": 0.26818,
+            "25": 0.26614,
+            "30": 0.26524,
+            "35": 0.30697,
+            "40": 0.2925,
+            "45": 0.26534,
+            "50": 0.26504,
+            "55": 0.26684,
+            "60": 0.26501,
+            "65": 0.26543,
+            "70": 0.26612,
+            "75": 0.26476,
+            "80": 0.26501,
+            "85": 0.26505,
+            "90": 0.26596,
+            "95": 0.26599,
+            "100": 0.2641
+        }
+    }
+}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Tree
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 100
+  --timing-log-level: 2
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 50
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 2
+  --use-custom-fsdp: true
+  --calculate-per-token-loss: true
+  --data-parallel-sharding-strategy: optim_grads_params
+  --use-distributed-optimizer: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-checkpoint-opt_param-scheduler: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_dev.json
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 10.81978,
+            "5": 10.85277,
+            "10": 10.79054,
+            "15": 10.81259,
+            "20": 10.71561,
+            "25": 10.52391,
+            "30": 10.33354,
+            "35": 10.22869,
+            "40": 10.04307,
+            "45": 9.77101,
+            "50": 9.86315,
+            "55": 9.82489,
+            "60": 9.45369,
+            "65": 8.89336,
+            "70": 9.69013,
+            "75": 9.38429,
+            "80": 9.37031,
+            "85": 9.58022,
+            "90": 9.78525,
+            "95": 9.49638,
+            "100": 9.36739
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 27138.0,
+            "5": 32036.0,
+            "10": 26255.0,
+            "15": 31309.0,
+            "20": 28869.0,
+            "25": 28605.0,
+            "30": 30817.0,
+            "35": 32882.0,
+            "40": 35373.0,
+            "45": 35484.0,
+            "50": 2136527.0,
+            "55": 2135084.0,
+            "60": 2137981.0,
+            "65": 2138995.0,
+            "70": 2142528.0,
+            "75": 2215276.0,
+            "80": 2144227.0,
+            "85": 2146040.0,
+            "90": 2146440.0,
+            "95": 2144187.0,
+            "100": 2144354.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 668320768.0,
+            "5": 668306944.0,
+            "10": 668313600.0,
+            "15": 668326912.0,
+            "20": 668314112.0,
+            "25": 668332544.0,
+            "30": 668326912.0,
+            "35": 668337664.0,
+            "40": 668306432.0,
+            "45": 668297728.0,
+            "50": 668282880.0,
+            "55": 668265984.0,
+            "60": 668249088.0,
+            "65": 668242944.0,
+            "70": 668224512.0,
+            "75": 668213248.0,
+            "80": 668222464.0,
+            "85": 668234752.0,
+            "90": 668237312.0,
+            "95": 668223488.0,
+            "100": 668209664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 2355231744.0,
+            "5": 2605464064.0,
+            "10": 2605464064.0,
+            "15": 2605464064.0,
+            "20": 2605464064.0,
+            "25": 2615321600.0,
+            "30": 2615321600.0,
+            "35": 2618603520.0,
+            "40": 2618603520.0,
+            "45": 2618603520.0,
+            "50": 2618603520.0,
+            "55": 2618603520.0,
+            "60": 2618603520.0,
+            "65": 2618603520.0,
+            "70": 2618603520.0,
+            "75": 2618603520.0,
+            "80": 2618603520.0,
+            "85": 2618603520.0,
+            "90": 2618603520.0,
+            "95": 2618603520.0,
+            "100": 2618603520.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 6.84429,
+            "5": 0.49894,
+            "10": 0.4932,
+            "15": 0.48106,
+            "20": 0.48362,
+            "25": 0.48615,
+            "30": 0.49038,
+            "35": 0.49011,
+            "40": 0.50012,
+            "45": 0.49982,
+            "50": 0.49286,
+            "55": 0.92115,
+            "60": 0.49142,
+            "65": 0.49128,
+            "70": 0.49444,
+            "75": 0.49725,
+            "80": 0.4978,
+            "85": 0.49747,
+            "90": 0.497,
+            "95": 0.49687,
+            "100": 0.49788
+        }
+    }
+}
\ No newline at end of file