更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.81873, "5": 10.85262, "10": 10.78413, "15": 10.79311, "20": 10.69219, "25": 10.52454, "30": 10.34542, "35": 10.26245, "40": 10.07286, "45": 9.8112, "50": 9.88428, "55": 9.86376, "60": 9.47981, "65": 8.93093, "70": 9.71205, "75": 9.4002, "80": 9.39074, "85": 9.60143, "90": 9.8051, "95": 9.5081, "100": 9.39221}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1541.0, "5": 1912.0, "10": 1317.0, "15": 1921.0, "20": 1595.0, "25": 1666.0, "30": 1933.0, "35": 1920.0, "40": 2094.0, "45": 2101.0, "50": 2362.0, "55": 2269.0, "60": 2379.0, "65": 2624.0, "70": 3128.0, "75": 2551.0, "80": 3192.0, "85": 3503.0, "90": 2966.0, "95": 3326.0, "100": 3383.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 731763200.0, "5": 731763200.0, "10": 731763200.0, "15": 731763200.0, "20": 731763200.0, "25": 731763200.0, "30": 731763200.0, "35": 731763200.0, "40": 731763200.0, "45": 731763200.0, "50": 731763200.0, "55": 731763200.0, "60": 731763200.0, "65": 731763200.0, "70": 731763200.0, "75": 731763200.0, "80": 731763200.0, "85": 731763200.0, "90": 731763200.0, "95": 731763200.0, "100": 731763200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2368927744.0, "5": 2649590784.0, "10": 2649590784.0, "15": 2649590784.0, "20": 2649590784.0, "25": 2649590784.0, "30": 2649590784.0, "35": 2649590784.0, "40": 2649590784.0, "45": 2649590784.0, "50": 2649590784.0, "55": 2649590784.0, "60": 2649590784.0, "65": 2649590784.0, "70": 2649590784.0, "75": 2649590784.0, "80": 2649590784.0, "85": 2649590784.0, "90": 2649590784.0, "95": 2649590784.0, "100": 2649590784.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 20.40385, "5": 0.1579, "10": 0.15096, "15": 0.15266, "20": 0.15158, "25": 0.15033, "30": 0.15058, "35": 0.14902, "40": 0.14939, "45": 0.14907, "50": 0.14846, "55": 0.1513, "60": 0.15238, "65": 0.1517, "70": 0.15268, "75": 0.15317, "80": 0.15218, "85": 0.14985, "90": 0.15084, "95": 0.14835, "100": 0.14852}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.79229, "20": 10.69211, "25": 10.52412, "30": 10.34552, "35": 10.26242, "40": 10.07239, "45": 9.811, "50": 9.88415, "55": 9.86374, "60": 9.47965, "65": 8.93065, "70": 9.71216, "75": 9.40049, "80": 9.39075, "85": 9.6014, "90": 9.80503, "95": 9.50817, "100": 9.39236}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1848.0, "20": 1601.0, "25": 1635.0, "30": 1908.0, "35": 1925.0, "40": 2126.0, "45": 2086.0, "50": 2298.0, "55": 2284.0, "60": 2337.0, "65": 2636.0, "70": 3136.0, "75": 2539.0, "80": 3253.0, "85": 3363.0, "90": 3004.0, "95": 3333.0, "100": 3447.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 733859840.0, "5": 733859840.0, "10": 733859840.0, "15": 733859840.0, "20": 733859840.0, "25": 733859840.0, "30": 733859840.0, "35": 733859840.0, "40": 733859840.0, "45": 733859840.0, "50": 733859840.0, "55": 733859840.0, "60": 733859840.0, "65": 733859840.0, "70": 733859840.0, "75": 733859840.0, "80": 733859840.0, "85": 733859840.0, "90": 733859840.0, "95": 733859840.0, "100": 733859840.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3838895104.0, "5": 4122703872.0, "10": 4122703872.0, "15": 4122703872.0, "20": 4122703872.0, "25": 4122703872.0, "30": 4122703872.0, "35": 4122703872.0, "40": 4122703872.0, "45": 4122703872.0, "50": 4122703872.0, "55": 4122703872.0, "60": 4122703872.0, "65": 4122703872.0, "70": 4122703872.0, "75": 4122703872.0, "80": 4122703872.0, "85": 4122703872.0, "90": 4122703872.0, "95": 4122703872.0, "100": 4122703872.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 15.54005, "5": 0.15749, "10": 0.15969, "15": 0.15843, "20": 0.15895, "25": 0.1586, "30": 0.15871, "35": 0.15794, "40": 0.1604, "45": 0.15841, "50": 0.15961, "55": 0.16456, "60": 0.16138, "65": 0.16027, "70": 0.16206, "75": 0.15997, "80": 0.16097, "85": 0.16718, "90": 0.16652, "95": 0.1684, "100": 0.16791}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -46,6 +46,8 @@ MODEL_ARGS:
  --use-checkpoint-opt_param-scheduler: true
  --use-mcore-models: true
  --ckpt-format: torch_dist
+  --ckpt-assume-constant-structure: true
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.81873, "5": 10.85262, "10": 10.78415, "15": 10.79311, "20": 10.69222, "25": 10.52454, "30": 10.34542, "35": 10.26242, "40": 10.07283, "45": 9.81123, "50": 9.88433, "55": 9.86374, "60": 9.47985, "65": 8.93093, "70": 9.71206, "75": 9.4002, "80": 9.39071, "85": 9.60143, "90": 9.80506, "95": 9.50809, "100": 9.39219}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1541.0, "5": 1912.0, "10": 1353.0, "15": 1917.0, "20": 1675.0, "25": 1730.0, "30": 1899.0, "35": 1951.0, "40": 2020.0, "45": 2040.0, "50": 2385.0, "55": 2263.0, "60": 2327.0, "65": 2612.0, "70": 3254.0, "75": 2613.0, "80": 3186.0, "85": 3386.0, "90": 3037.0, "95": 3302.0, "100": 3280.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 521926144.0, "5": 521926144.0, "10": 521926144.0, "15": 521926144.0, "20": 521926144.0, "25": 521926144.0, "30": 521926144.0, "35": 521926144.0, "40": 521926144.0, "45": 521926144.0, "50": 521926144.0, "55": 521926144.0, "60": 521926144.0, "65": 521926144.0, "70": 521926144.0, "75": 521926144.0, "80": 521926144.0, "85": 521926144.0, "90": 521926144.0, "95": 521926144.0, "100": 521926144.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2298877952.0, "5": 2440802304.0, "10": 2440802304.0, "15": 2440802304.0, "20": 2440802304.0, "25": 2440802304.0, "30": 2440802304.0, "35": 2440802304.0, "40": 2440802304.0, "45": 2440802304.0, "50": 2440802304.0, "55": 2440802304.0, "60": 2440802304.0, "65": 2440802304.0, "70": 2440802304.0, "75": 2440802304.0, "80": 2440802304.0, "85": 2440802304.0, "90": 2440802304.0, "95": 2440802304.0, "100": 2440802304.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.42431, "5": 0.15421, "10": 0.15404, "15": 0.15679, "20": 0.15514, "25": 0.1535, "30": 0.1545, "35": 0.15342, "40": 0.15339, "45": 0.15224, "50": 0.15191, "55": 0.14871, "60": 0.14706, "65": 0.14745, "70": 0.14606, "75": 0.1482, "80": 0.14783, "85": 0.15003, "90": 0.14935, "95": 0.15271, "100": 0.16034}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69211, "25": 10.52414, "30": 10.34555, "35": 10.2624, "40": 10.07237, "45": 9.81103, "50": 9.88417, "55": 9.86375, "60": 9.47966, "65": 8.93063, "70": 9.71218, "75": 9.40046, "80": 9.39077, "85": 9.60141, "90": 9.80504, "95": 9.50823, "100": 9.39237}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1668.0, "25": 1607.0, "30": 1945.0, "35": 1860.0, "40": 2022.0, "45": 2042.0, "50": 2292.0, "55": 2273.0, "60": 2355.0, "65": 2674.0, "70": 3184.0, "75": 2582.0, "80": 3237.0, "85": 3377.0, "90": 2972.0, "95": 3318.0, "100": 3514.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 523003904.0, "5": 523003904.0, "10": 523003904.0, "15": 523003904.0, "20": 523003904.0, "25": 523003904.0, "30": 523003904.0, "35": 523003904.0, "40": 523003904.0, "45": 523003904.0, "50": 523003904.0, "55": 523003904.0, "60": 523003904.0, "65": 523003904.0, "70": 523003904.0, "75": 523003904.0, "80": 523003904.0, "85": 523003904.0, "90": 523003904.0, "95": 523003904.0, "100": 523003904.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3768872960.0, "5": 3912765440.0, "10": 3912765440.0, "15": 3912765440.0, "20": 3912765440.0, "25": 3912765440.0, "30": 3912765440.0, "35": 3912765440.0, "40": 3912765440.0, "45": 3912765440.0, "50": 3912765440.0, "55": 3912765440.0, "60": 3912765440.0, "65": 3912765440.0, "70": 3912765440.0, "75": 3912765440.0, "80": 3912765440.0, "85": 3912765440.0, "90": 3912765440.0, "95": 3912765440.0, "100": 3912765440.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.02969, "5": 0.15735, "10": 0.16072, "15": 0.15723, "20": 0.15745, "25": 0.15826, "30": 0.15964, "35": 0.16023, "40": 0.15616, "45": 0.15487, "50": 0.15469, "55": 0.1613, "60": 0.16121, "65": 0.1622, "70": 0.1599, "75": 0.15976, "80": 0.16152, "85": 0.16061, "90": 0.15993, "95": 0.15988, "100": 0.1599}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,4 +50,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.81873, "5": 10.85262, "10": 10.78415, "15": 10.79311, "20": 10.69222, "25": 10.52454, "30": 10.34542, "35": 10.26242, "40": 10.07283, "45": 9.81123, "50": 9.88433, "55": 9.86374, "60": 9.47985, "65": 8.93093, "70": 9.71206, "75": 9.4002, "80": 9.39071, "85": 9.60143, "90": 9.80506, "95": 9.50809, "100": 9.39219}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1541.0, "5": 1912.0, "10": 1353.0, "15": 1917.0, "20": 1675.0, "25": 1730.0, "30": 1899.0, "35": 1951.0, "40": 2020.0, "45": 2040.0, "50": 2385.0, "55": 2263.0, "60": 2327.0, "65": 2612.0, "70": 3254.0, "75": 2613.0, "80": 3186.0, "85": 3386.0, "90": 3037.0, "95": 3302.0, "100": 3280.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 521926144.0, "5": 522974720.0, "10": 522974720.0, "15": 522974720.0, "20": 522974720.0, "25": 522974720.0, "30": 522974720.0, "35": 522974720.0, "40": 522974720.0, "45": 522974720.0, "50": 522974720.0, "55": 522974720.0, "60": 522974720.0, "65": 522974720.0, "70": 522974720.0, "75": 522974720.0, "80": 522974720.0, "85": 522974720.0, "90": 522974720.0, "95": 522974720.0, "100": 522974720.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2298877952.0, "5": 2440802304.0, "10": 2440802304.0, "15": 2440802304.0, "20": 2440802304.0, "25": 2440802304.0, "30": 2440802304.0, "35": 2440802304.0, "40": 2440802304.0, "45": 2440802304.0, "50": 2440802304.0, "55": 2440802304.0, "60": 2440802304.0, "65": 2440802304.0, "70": 2440802304.0, "75": 2440802304.0, "80": 2440802304.0, "85": 2440802304.0, "90": 2440802304.0, "95": 2440802304.0, "100": 2440802304.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.79264, "5": 0.15187, "10": 0.15534, "15": 0.15767, "20": 0.15499, "25": 0.1556, "30": 0.15402, "35": 0.15483, "40": 0.15282, "45": 0.15494, "50": 0.15426, "55": 0.1549, "60": 0.15835, "65": 0.15515, "70": 0.15423, "75": 0.15522, "80": 0.15525, "85": 0.15444, "90": 0.15344, "95": 0.15532, "100": 0.15381}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82005, "5": 10.85284, "10": 10.78455, "15": 10.7923, "20": 10.69211, "25": 10.52414, "30": 10.34555, "35": 10.2624, "40": 10.07237, "45": 9.81103, "50": 9.88417, "55": 9.86375, "60": 9.47966, "65": 8.93063, "70": 9.71218, "75": 9.40046, "80": 9.39077, "85": 9.60141, "90": 9.80504, "95": 9.50823, "100": 9.39237}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1559.0, "5": 1840.0, "10": 1380.0, "15": 1850.0, "20": 1668.0, "25": 1607.0, "30": 1945.0, "35": 1860.0, "40": 2022.0, "45": 2042.0, "50": 2292.0, "55": 2273.0, "60": 2355.0, "65": 2674.0, "70": 3184.0, "75": 2582.0, "80": 3237.0, "85": 3377.0, "90": 2972.0, "95": 3318.0, "100": 3514.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 523003904.0, "5": 523003904.0, "10": 523003904.0, "15": 523003904.0, "20": 523003904.0, "25": 523003904.0, "30": 523003904.0, "35": 523003904.0, "40": 523003904.0, "45": 523003904.0, "50": 523003904.0, "55": 523003904.0, "60": 523003904.0, "65": 523003904.0, "70": 523003904.0, "75": 523003904.0, "80": 523003904.0, "85": 523003904.0, "90": 523003904.0, "95": 523003904.0, "100": 523003904.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3768872960.0, "5": 3912765440.0, "10": 3912765440.0, "15": 3912765440.0, "20": 3912765440.0, "25": 3912765440.0, "30": 3912765440.0, "35": 3912765440.0, "40": 3912765440.0, "45": 3912765440.0, "50": 3912765440.0, "55": 3912765440.0, "60": 3912765440.0, "65": 3912765440.0, "70": 3912765440.0, "75": 3912765440.0, "80": 3912765440.0, "85": 3912765440.0, "90": 3912765440.0, "95": 3912765440.0, "100": 3912765440.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 15.5918, "5": 0.16302, "10": 0.16601, "15": 0.16426, "20": 0.16354, "25": 0.16043, "30": 0.16113, "35": 0.16179, "40": 0.15952, "45": 0.15947, "50": 0.15972, "55": 0.1647, "60": 0.16566, "65": 0.16575, "70": 0.16359, "75": 0.16555, "80": 0.16349, "85": 0.16459, "90": 0.1641, "95": 0.16549, "100": 0.16374}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -44,6 +44,7 @@ MODEL_ARGS:
  --overlap-grad-reduce: true
  --overlap-param-gather: true
  --check-weight-hash-across-dp-replicas-interval: 10
+  --disable-gloo-process-groups: true
  --ckpt-fully-parallel-load: true
  --deterministic-mode: true
  --no-gradient-accumulation-fusion: true
@@ -51,6 +52,8 @@ MODEL_ARGS:
  --use-checkpoint-opt_param-scheduler: true
  --use-mcore-models: true
  --ckpt-format: torch_dist
+  --ckpt-assume-constant-structure: true
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.93652, "5": 10.93219, "10": 10.91159, "15": 10.85668, "20": 10.77161, "25": 10.60544, "30": 10.40595, "35": 10.31396, "40": 10.12361, "45": 9.87606, "50": 9.94483, "55": 9.90094, "60": 9.5526, "65": 8.96804, "70": 9.77858, "75": 9.44577, "80": 9.4199, "85": 9.64322, "90": 9.85834, "95": 9.52082, "100": 9.43404}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 22727554.0, "5": 22715260.0, "10": 22919068.0, "15": 22821164.0, "20": 22693678.0, "25": 22819604.0, "30": 22631168.0, "35": 22787934.0, "40": 22658232.0, "45": 22674504.0, "50": 22904460.0, "55": 22519162.0, "60": 22743128.0, "65": 23060980.0, "70": 22829344.0, "75": 23053962.0, "80": 22707280.0, "85": 22712296.0, "90": 22971840.0, "95": 23047794.0, "100": 23015940.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 520891904.0, "5": 520891904.0, "10": 520891904.0, "15": 520891904.0, "20": 520891904.0, "25": 520891904.0, "30": 520891904.0, "35": 520891904.0, "40": 520891904.0, "45": 520891904.0, "50": 520891904.0, "55": 520891904.0, "60": 520891904.0, "65": 520891904.0, "70": 520891904.0, "75": 520891904.0, "80": 520891904.0, "85": 520891904.0, "90": 520891904.0, "95": 520891904.0, "100": 520891904.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2299948032.0, "5": 2439768064.0, "10": 2439768064.0, "15": 2439768064.0, "20": 2439768064.0, "25": 2439768064.0, "30": 2439768064.0, "35": 2439768064.0, "40": 2439768064.0, "45": 2439768064.0, "50": 2439768064.0, "55": 2439768064.0, "60": 2439768064.0, "65": 2439768064.0, "70": 2439768064.0, "75": 2439768064.0, "80": 2439768064.0, "85": 2439768064.0, "90": 2439768064.0, "95": 2439768064.0, "100": 2439768064.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.00303, "5": 0.15968, "10": 0.16109, "15": 0.15942, "20": 0.15611, "25": 0.15603, "30": 0.15632, "35": 0.15548, "40": 0.15633, "45": 0.15576, "50": 0.15591, "55": 0.15317, "60": 0.1529, "65": 0.15213, "70": 0.15024, "75": 0.15269, "80": 0.15331, "85": 0.15295, "90": 0.15136, "95": 0.15353, "100": 0.15201}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.9359, "5": 10.93225, "10": 10.91081, "15": 10.85723, "20": 10.77091, "25": 10.60558, "30": 10.40544, "35": 10.31364, "40": 10.12333, "45": 9.8756, "50": 9.94451, "55": 9.90089, "60": 9.55236, "65": 8.96792, "70": 9.77832, "75": 9.44604, "80": 9.4201, "85": 9.64321, "90": 9.85827, "95": 9.52085, "100": 9.43416}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 22727686.0, "5": 22715312.0, "10": 22919004.0, "15": 22821282.0, "20": 22693812.0, "25": 22819580.0, "30": 22631132.0, "35": 22787906.0, "40": 22658304.0, "45": 22674764.0, "50": 22904438.0, "55": 22519056.0, "60": 22743204.0, "65": 23060980.0, "70": 22829348.0, "75": 23054184.0, "80": 22707228.0, "85": 22712172.0, "90": 22971870.0, "95": 23047656.0, "100": 23016066.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 523016192.0, "5": 523016192.0, "10": 523016192.0, "15": 523016192.0, "20": 523016192.0, "25": 523016192.0, "30": 523016192.0, "35": 523016192.0, "40": 523016192.0, "45": 523016192.0, "50": 523016192.0, "55": 523016192.0, "60": 523016192.0, "65": 523016192.0, "70": 523016192.0, "75": 523016192.0, "80": 523016192.0, "85": 523016192.0, "90": 523016192.0, "95": 523016192.0, "100": 523016192.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3769943040.0, "5": 3914774528.0, "10": 3914774528.0, "15": 3914774528.0, "20": 3914774528.0, "25": 3914774528.0, "30": 3914774528.0, "35": 3914774528.0, "40": 3914774528.0, "45": 3914774528.0, "50": 3914774528.0, "55": 3914774528.0, "60": 3914774528.0, "65": 3914774528.0, "70": 3914774528.0, "75": 3914774528.0, "80": 3914774528.0, "85": 3914774528.0, "90": 3914774528.0, "95": 3914774528.0, "100": 3914774528.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.01453, "5": 0.166, "10": 0.16623, "15": 0.16232, "20": 0.16292, "25": 0.16015, "30": 0.15928, "35": 0.15947, "40": 0.15937, "45": 0.16129, "50": 0.15757, "55": 0.16029, "60": 0.15889, "65": 0.15795, "70": 0.15758, "75": 0.15718, "80": 0.15858, "85": 0.15639, "90": 0.15635, "95": 0.15626, "100": 0.1578}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -51,4 +51,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.81112, "5": 10.85474, "10": 10.88269, "15": 10.80707, "20": 10.63254, "25": 10.47352, "30": 10.39308, "35": 9.96795, "40": 10.10006, "45": 9.6834, "50": 9.92476, "55": 9.98142, "60": 9.3523, "65": 9.70184, "70": 9.73813, "75": 8.95596, "80": 9.31468, "85": 8.97866, "90": 9.55803, "95": 9.13519, "100": 9.21084}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1011.0, "5": 1122.0, "10": 1338.0, "15": 1172.0, "20": 1143.0, "25": 1122.0, "30": 1527.0, "35": 1239.0, "40": 1436.0, "45": 1565.0, "50": 1754.0, "55": 1787.0, "60": 1768.0, "65": 2460.0, "70": 2516.0, "75": 2011.0, "80": 1970.0, "85": 2220.0, "90": 2181.0, "95": 2486.0, "100": 1974.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 731763200.0, "5": 731763200.0, "10": 731763200.0, "15": 731763200.0, "20": 731763200.0, "25": 731763200.0, "30": 731763200.0, "35": 731763200.0, "40": 731763200.0, "45": 731763200.0, "50": 731763200.0, "55": 731763200.0, "60": 731763200.0, "65": 731763200.0, "70": 731763200.0, "75": 731763200.0, "80": 731763200.0, "85": 731763200.0, "90": 731763200.0, "95": 731763200.0, "100": 731763200.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2516124160.0, "5": 2796787200.0, "10": 2796787200.0, "15": 2796787200.0, "20": 2796787200.0, "25": 2796787200.0, "30": 2796787200.0, "35": 2796787200.0, "40": 2796787200.0, "45": 2796787200.0, "50": 2796787200.0, "55": 2796787200.0, "60": 2796787200.0, "65": 2796787200.0, "70": 2796787200.0, "75": 2796787200.0, "80": 2796787200.0, "85": 2796787200.0, "90": 2796787200.0, "95": 2796787200.0, "100": 2796787200.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 17.34358, "5": 0.33658, "10": 0.32993, "15": 0.32841, "20": 0.32798, "25": 0.33006, "30": 0.3274, "35": 0.32586, "40": 0.32487, "45": 0.32557, "50": 0.32425, "55": 0.33341, "60": 0.33044, "65": 0.32759, "70": 0.32883, "75": 0.33041, "80": 0.33042, "85": 0.33049, "90": 0.32813, "95": 0.33174, "100": 0.32817}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.81184, "5": 10.85464, "10": 10.88256, "15": 10.80679, "20": 10.63196, "25": 10.47374, "30": 10.39285, "35": 9.96791, "40": 10.1, "45": 9.68346, "50": 9.92463, "55": 9.98132, "60": 9.3523, "65": 9.7021, "70": 9.73808, "75": 8.95617, "80": 9.31499, "85": 8.97886, "90": 9.5581, "95": 9.13527, "100": 9.21091}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1125.0, "5": 1255.0, "10": 1367.0, "15": 1127.0, "20": 1082.0, "25": 1114.0, "30": 1558.0, "35": 1292.0, "40": 1433.0, "45": 1637.0, "50": 1779.0, "55": 1819.0, "60": 1851.0, "65": 2490.0, "70": 2549.0, "75": 1996.0, "80": 1939.0, "85": 2175.0, "90": 2179.0, "95": 2519.0, "100": 2013.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 733859840.0, "5": 733859840.0, "10": 733859840.0, "15": 733859840.0, "20": 733859840.0, "25": 733859840.0, "30": 733859840.0, "35": 733859840.0, "40": 733859840.0, "45": 733859840.0, "50": 733859840.0, "55": 733859840.0, "60": 733859840.0, "65": 733859840.0, "70": 733859840.0, "75": 733859840.0, "80": 733859840.0, "85": 733859840.0, "90": 733859840.0, "95": 733859840.0, "100": 733859840.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4312984064.0, "5": 4596792832.0, "10": 4596792832.0, "15": 4596792832.0, "20": 4596792832.0, "25": 4596792832.0, "30": 4596792832.0, "35": 4596792832.0, "40": 4596792832.0, "45": 4596792832.0, "50": 4596792832.0, "55": 4596792832.0, "60": 4596792832.0, "65": 4596792832.0, "70": 4596792832.0, "75": 4596792832.0, "80": 4596792832.0, "85": 4596792832.0, "90": 4596792832.0, "95": 4596792832.0, "100": 4596792832.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 17.08432, "5": 0.34938, "10": 0.3472, "15": 0.34809, "20": 0.35058, "25": 0.34829, "30": 0.34775, "35": 0.34968, "40": 0.34886, "45": 0.34665, "50": 0.34852, "55": 0.35034, "60": 0.34955, "65": 0.34833, "70": 0.35115, "75": 0.35006, "80": 0.35114, "85": 0.3487, "90": 0.35045, "95": 0.3489, "100": 0.35007}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G/model_config.yaml
@@ -20,8 +20,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -51,4 +51,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81184, 10.84052, 10.8763, 10.79906, 10.68214, 10.59702, 10.49258, 10.11236, 10.12393, 9.98165]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1118.0, 1331.0, 1230.0, 1085.0, 1180.0, 1245.0, 1454.0, 1330.0, 1752.0, 1851.0]}, "iteration-time": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [17.24286, 0.35341, 0.35187, 0.35028, 0.34941, 0.35093, 0.3488, 0.35179, 0.34905, 0.34684]}}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81184, "5": 10.85467, "10": 10.88256, "15": 10.80682, "20": 10.63195, "25": 10.47372, "30": 10.39284, "35": 9.96785, "40": 10.09999, "45": 9.68342, "50": 9.92465}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1118.0, "5": 1229.0, "10": 1289.0, "15": 1125.0, "20": 1090.0, "25": 1110.0, "30": 1431.0, "35": 1132.0, "40": 1472.0, "45": 1544.0, "50": 1737.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 733859840.0, "5": 733859840.0, "10": 733859840.0, "15": 733859840.0, "20": 733859840.0, "25": 733859840.0, "30": 733859840.0, "35": 733859840.0, "40": 733859840.0, "45": 733859840.0, "50": 733859840.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4312984064.0, "5": 4596792832.0, "10": 4596792832.0, "15": 4596792832.0, "20": 4596792832.0, "25": 4596792832.0, "30": 4596792832.0, "35": 4596792832.0, "40": 4596792832.0, "45": 4596792832.0, "50": 4596792832.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.07311, "5": 0.3501, "10": 0.3498, "15": 0.34818, "20": 0.34849, "25": 0.34922, "30": 0.35192, "35": 0.35181, "40": 0.34504, "45": 0.34967, "50": 0.3384}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81184, 10.84052, 10.87624, 10.79904, 10.68212, 10.59698, 10.49257, 10.11232, 10.12396, 9.98163]},  "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1125.0, 1304.0, 1252.0, 1102.0, 1201.0, 1200.0, 1489.0, 1395.0, 1677.0, 1867.0]}, "num-zeros vs samples": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1125.0, 1304.0, 1252.0, 1102.0, 1201.0, 1200.0, 1489.0, 1395.0, 1677.0, 1867.0]}, "iteration-time": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22.22011, 0.36082, 0.35927, 0.35627, 0.35901, 0.35008, 0.34828, 0.34774, 0.35145, 0.35141]}}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.81184, "5": 10.85464, "10": 10.88256, "15": 10.80679, "20": 10.63196, "25": 10.47374, "30": 10.39285, "35": 9.96791, "40": 10.1, "45": 9.68346, "50": 9.92463}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1125.0, "5": 1255.0, "10": 1367.0, "15": 1127.0, "20": 1082.0, "25": 1114.0, "30": 1558.0, "35": 1292.0, "40": 1433.0, "45": 1637.0, "50": 1779.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 733859840.0, "5": 733859840.0, "10": 733859840.0, "15": 733859840.0, "20": 733859840.0, "25": 733859840.0, "30": 733859840.0, "35": 733859840.0, "40": 733859840.0, "45": 733859840.0, "50": 733859840.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4312984064.0, "5": 4596792832.0, "10": 4596792832.0, "15": 4596792832.0, "20": 4596792832.0, "25": 4596792832.0, "30": 4596792832.0, "35": 4596792832.0, "40": 4596792832.0, "45": 4596792832.0, "50": 4596792832.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 15.10993, "5": 0.35468, "10": 0.57928, "15": 0.34914, "20": 0.34779, "25": 0.34849, "30": 0.3476, "35": 0.34849, "40": 0.34839, "45": 0.34576, "50": 0.34689}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G/model_config.yaml
@@ -20,8 +20,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -51,4 +51,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_dev.json
-{
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.9735,
-            10.96043,
-            10.95577,
-            10.91036,
-            10.78792,
-            10.71198,
-            10.22428,
-            10.28927,
-            10.19052,
-            9.86378
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            22727056.0,
-            23021982.0,
-            22501104.0,
-            22831164.0,
-            22740086.0,
-            22547896.0,
-            22955344.0,
-            22589272.0,
-            22658866.0,
-            22885040.0
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            13.92799,
-            0.16275,
-            0.16118,
-            0.16212,
-            0.16165,
-            0.16181,
-            0.16104,
-            0.16149,
-            0.16151,
-            0.16055
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.9735, "5": 10.95597, "10": 10.94991, "15": 10.91152, "20": 10.80976, "25": 10.6662, "30": 10.45503, "35": 10.33419, "40": 10.1465, "45": 9.89112, "50": 9.95256}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22727056.0, "5": 22714202.0, "10": 22918312.0, "15": 22821034.0, "20": 22694248.0, "25": 22819602.0, "30": 22631112.0, "35": 22787556.0, "40": 22658080.0, "45": 22674612.0, "50": 22905288.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 658766848.0, "5": 658766848.0, "10": 657718272.0, "15": 658766848.0, "20": 657718272.0, "25": 658766848.0, "30": 657718272.0, "35": 658766848.0, "40": 657718272.0, "45": 658766848.0, "50": 657718272.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2129712128.0, "5": 2386204672.0, "10": 2386204672.0, "15": 2386204672.0, "20": 2386204672.0, "25": 2386204672.0, "30": 2386204672.0, "35": 2386204672.0, "40": 2386204672.0, "45": 2386204672.0, "50": 2386204672.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 15.12421, "5": 0.15186, "10": 0.15206, "15": 0.15102, "20": 0.14853, "25": 0.14759, "30": 0.14812, "35": 0.14831, "40": 0.14675, "45": 0.14841, "50": 0.14702}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.9735, 10.96043, 10.95576, 10.91038, 10.78791, 10.71201, 10.22424, 10.28926, 10.19049, 9.86378]},"num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [22727052.0, 23021930.0, 22501022.0, 22831208.0, 22740024.0, 22547916.0, 22955210.0, 22589344.0, 22658940.0, 22884970.0]},"iteration_timing_avg": 0.1367805882352941}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.9735, "5": 10.95594, "10": 10.94989, "15": 10.9115, "20": 10.80975, "25": 10.66619, "30": 10.45505, "35": 10.3342, "40": 10.14647, "45": 9.8911, "50": 9.95258}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 22727052.0, "5": 22714228.0, "10": 22918376.0, "15": 22820932.0, "20": 22694228.0, "25": 22819504.0, "30": 22631112.0, "35": 22787612.0, "40": 22658002.0, "45": 22674598.0, "50": 22905310.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 658766848.0, "5": 658766848.0, "10": 657718272.0, "15": 658766848.0, "20": 657718272.0, "25": 658766848.0, "30": 657718272.0, "35": 658766848.0, "40": 657718272.0, "45": 658766848.0, "50": 657718272.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2129712128.0, "5": 2386204672.0, "10": 2386204672.0, "15": 2386204672.0, "20": 2386204672.0, "25": 2386204672.0, "30": 2386204672.0, "35": 2386204672.0, "40": 2386204672.0, "45": 2386204672.0, "50": 2386204672.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 16.33012, "5": 0.15907, "10": 0.15749, "15": 0.15792, "20": 0.1558, "25": 0.15554, "30": 0.15634, "35": 0.15618, "40": 0.15564, "45": 0.15575, "50": 0.15527}}}
\ No newline at end of file