更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.85831, "5": 10.87279, "10": 10.83267, "15": 10.82104, "20": 10.71376, "25": 10.54763, "30": 10.36782, "35": 10.2846, "40": 10.08923, "45": 9.84556, "50": 9.91944, "55": 9.89194, "60": 9.5082, "65": 8.9595, "70": 9.73443, "75": 9.43114, "80": 9.41103, "85": 9.61515, "90": 9.82371, "95": 9.5226, "100": 9.40801}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1758.0, "5": 2093.0, "10": 1539.0, "15": 2026.0, "20": 1800.0, "25": 1786.0, "30": 2071.0, "35": 2219.0, "40": 2402.0, "45": 2268.0, "50": 2714.0, "55": 2588.0, "60": 2760.0, "65": 2831.0, "70": 3489.0, "75": 2724.0, "80": 3683.0, "85": 3637.0, "90": 3411.0, "95": 3592.0, "100": 3642.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 232398336.0, "5": 232398336.0, "10": 232398336.0, "15": 232398336.0, "20": 232398336.0, "25": 232398336.0, "30": 232398336.0, "35": 232398336.0, "40": 232398336.0, "45": 232398336.0, "50": 232398336.0, "55": 232398336.0, "60": 232398336.0, "65": 232398336.0, "70": 232398336.0, "75": 232398336.0, "80": 232398336.0, "85": 232398336.0, "90": 232398336.0, "95": 232398336.0, "100": 232398336.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 682342912.0, "5": 773245440.0, "10": 773245440.0, "15": 773245440.0, "20": 773245440.0, "25": 773246464.0, "30": 773246464.0, "35": 773246464.0, "40": 773246464.0, "45": 773246464.0, "50": 773246464.0, "55": 773246464.0, "60": 773246464.0, "65": 773246464.0, "70": 773246464.0, "75": 773246464.0, "80": 773246464.0, "85": 773246464.0, "90": 775342080.0, "95": 775342080.0, "100": 775342080.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.25721, "5": 0.297, "10": 0.2962, "15": 0.29314, "20": 0.29254, "25": 0.29368, "30": 0.29285, "35": 0.2939, "40": 0.29424, "45": 0.29981, "50": 0.29991, "55": 0.28268, "60": 0.2813, "65": 0.28183, "70": 0.28205, "75": 0.28103, "80": 0.28125, "85": 0.28141, "90": 0.28129, "95": 0.28133, "100": 0.28055}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8583, "5": 10.87284, "10": 10.83264, "15": 10.82102, "20": 10.71379, "25": 10.54766, "30": 10.3679, "35": 10.28457, "40": 10.08925, "45": 9.84556, "50": 9.91943, "55": 9.89191, "60": 9.50823, "65": 8.95947, "70": 9.73446, "75": 9.43115, "80": 9.411, "85": 9.61516, "90": 9.82374, "95": 9.52257, "100": 9.408}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1693.0, "5": 2113.0, "10": 1534.0, "15": 2023.0, "20": 1755.0, "25": 1764.0, "30": 2036.0, "35": 2228.0, "40": 2447.0, "45": 2332.0, "50": 2745.0, "55": 2594.0, "60": 2725.0, "65": 2901.0, "70": 3493.0, "75": 2725.0, "80": 3691.0, "85": 3596.0, "90": 3410.0, "95": 3607.0, "100": 3719.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 232422400.0, "5": 232422400.0, "10": 232422400.0, "15": 232422400.0, "20": 232422400.0, "25": 232422400.0, "30": 232422400.0, "35": 232422400.0, "40": 232422400.0, "45": 232422400.0, "50": 232422400.0, "55": 232422400.0, "60": 232422400.0, "65": 232422400.0, "70": 232422400.0, "75": 232422400.0, "80": 232422400.0, "85": 232422400.0, "90": 232422400.0, "95": 232422400.0, "100": 232422400.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.16523, "5": 0.31605, "10": 0.28733, "15": 0.28667, "20": 0.28015, "25": 0.31509, "30": 0.28969, "35": 0.28728, "40": 0.29047, "45": 0.28331, "50": 0.28547, "55": 0.2768, "60": 0.27873, "65": 0.2789, "70": 0.27983, "75": 0.27902, "80": 0.27972, "85": 0.28215, "90": 0.27786, "95": 0.28072, "100": 0.28294}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 100
+  --timing-log-level: 2
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 50
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 4
+  --pipeline-model-parallel-size: 2
+  --use-distributed-optimizer: true
+  --async-save: true
+  --ckpt-fully-parallel-save: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-checkpoint-opt_param-scheduler: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --log-memory-to-tensorboard: true
+TEST_TYPE: frozen-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.85831, "5": 10.87284, "10": 10.83268, "15": 10.82102, "20": 10.71377, "25": 10.54763, "30": 10.36785, "35": 10.28461, "40": 10.08928, "45": 9.84557, "50": 9.9194, "55": 9.89197, "60": 9.50823, "65": 8.9595, "70": 9.73441, "75": 9.43113, "80": 9.411, "85": 9.61514, "90": 9.82373, "95": 9.52255, "100": 9.40799}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1775.0, "5": 2048.0, "10": 1559.0, "15": 2026.0, "20": 1790.0, "25": 1815.0, "30": 2056.0, "35": 2157.0, "40": 2311.0, "45": 2242.0, "50": 2756.0, "55": 2589.0, "60": 2651.0, "65": 2874.0, "70": 3534.0, "75": 2840.0, "80": 3634.0, "85": 3505.0, "90": 3377.0, "95": 3729.0, "100": 3572.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 232398336.0, "5": 232398336.0, "10": 232398336.0, "15": 232398336.0, "20": 232398336.0, "25": 232398336.0, "30": 232398336.0, "35": 232398336.0, "40": 233446912.0, "45": 232398336.0, "50": 232398336.0, "55": 232398336.0, "60": 232398336.0, "65": 232398336.0, "70": 232398336.0, "75": 232398336.0, "80": 232398336.0, "85": 232398336.0, "90": 232398336.0, "95": 232398336.0, "100": 232398336.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 686536192.0, "5": 775341056.0, "10": 775341056.0, "15": 775341056.0, "20": 775342080.0, "25": 775343616.0, "30": 775343616.0, "35": 775343616.0, "40": 775343616.0, "45": 775343616.0, "50": 775343616.0, "55": 775343616.0, "60": 775343616.0, "65": 775343616.0, "70": 775343616.0, "75": 775343616.0, "80": 775343616.0, "85": 775343616.0, "90": 775343616.0, "95": 775343616.0, "100": 775343616.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.96401, "5": 0.29061, "10": 0.28498, "15": 0.28362, "20": 0.28222, "25": 0.28294, "30": 0.28438, "35": 0.28301, "40": 0.28255, "45": 0.28337, "50": 0.28254, "55": 0.29177, "60": 0.29121, "65": 0.2911, "70": 0.29076, "75": 0.29215, "80": 0.29191, "85": 0.28992, "90": 0.29114, "95": 0.29025, "100": 0.28959}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8583, "5": 10.87279, "10": 10.83264, "15": 10.82099, "20": 10.71379, "25": 10.54767, "30": 10.36789, "35": 10.2846, "40": 10.08927, "45": 9.84554, "50": 9.9194, "55": 9.89196, "60": 9.5082, "65": 8.95952, "70": 9.7344, "75": 9.4311, "80": 9.411, "85": 9.61517, "90": 9.82372, "95": 9.52256, "100": 9.408}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1763.0, "5": 2118.0, "10": 1540.0, "15": 2065.0, "20": 1836.0, "25": 1790.0, "30": 2030.0, "35": 2200.0, "40": 2389.0, "45": 2250.0, "50": 2793.0, "55": 2708.0, "60": 2777.0, "65": 2829.0, "70": 3443.0, "75": 2863.0, "80": 3676.0, "85": 3495.0, "90": 3282.0, "95": 3687.0, "100": 3655.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 232422400.0, "5": 232422400.0, "10": 232422400.0, "15": 232422400.0, "20": 232422400.0, "25": 232422400.0, "30": 232422400.0, "35": 232422400.0, "40": 232422400.0, "45": 232422400.0, "50": 232422400.0, "55": 232422400.0, "60": 232422400.0, "65": 232422400.0, "70": 232422400.0, "75": 232422400.0, "80": 232422400.0, "85": 232422400.0, "90": 232422400.0, "95": 232422400.0, "100": 232422400.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 686566400.0, "5": 775371776.0, "10": 775371776.0, "15": 775372288.0, "20": 775372288.0, "25": 775372288.0, "30": 775372288.0, "35": 775372288.0, "40": 775372288.0, "45": 775372288.0, "50": 775372288.0, "55": 775372288.0, "60": 775372288.0, "65": 775372288.0, "70": 775372288.0, "75": 775372288.0, "80": 775372288.0, "85": 775372288.0, "90": 775372288.0, "95": 775372288.0, "100": 775372288.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 16.98947, "5": 0.28276, "10": 0.29522, "15": 0.28583, "20": 0.29135, "25": 0.28791, "30": 0.28029, "35": 0.27945, "40": 0.27988, "45": 0.29308, "50": 0.28374, "55": 0.2909, "60": 0.29746, "65": 0.28807, "70": 0.29826, "75": 0.28803, "80": 0.29862, "85": 0.28869, "90": 0.28952, "95": 0.28889, "100": 0.28882}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G/model_config.yaml
@@ -17,8 +17,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91617, "15": 10.93901, "20": 10.93406, "25": 10.8858, "30": 10.81297, "35": 10.72203, "40": 10.55145, "45": 10.32854, "50": 10.28775, "55": 10.21253, "60": 9.833, "65": 9.27297, "70": 9.92539, "75": 9.59673, "80": 9.55132, "85": 9.73428, "90": 9.9073, "95": 9.60983, "100": 9.50131}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 379952128.0, "5": 378379264.0, "10": 379427840.0, "15": 378379264.0, "20": 559762944.0, "25": 561860096.0, "30": 561073664.0, "35": 561073664.0, "40": 560287232.0, "45": 559762944.0, "50": 560287232.0, "55": 561073664.0, "60": 559762944.0, "65": 559762944.0, "70": 559762944.0, "75": 559762944.0, "80": 559762944.0, "85": 559762944.0, "90": 561860096.0, "95": 560549376.0, "100": 560549376.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.22195, "5": 0.20217, "10": 0.20177, "15": 0.20429, "20": 0.21411, "25": 0.21219, "30": 0.21117, "35": 0.21259, "40": 0.21302, "45": 0.21291, "50": 0.21122, "55": 0.22967, "60": 0.2322, "65": 0.23206, "70": 0.23201, "75": 0.23017, "80": 0.22985, "85": 0.23239, "90": 0.231, "95": 0.23146, "100": 0.23157}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1799.0, "25": 2506.0, "30": 2471.0, "35": 2010.0, "40": 2153.0, "45": 2427.0, "50": 2914.0, "55": 2337.0, "60": 2978.0, "65": 2225.0, "70": 3612.0, "75": 3018.0, "80": 3488.0, "85": 3875.0, "90": 3770.0, "95": 3946.0, "100": 3446.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91617, "15": 10.93901, "20": 10.93406, "25": 10.8858, "30": 10.81297, "35": 10.72203, "40": 10.55145, "45": 10.32854, "50": 10.28775, "55": 10.21253, "60": 9.833, "65": 9.27297, "70": 9.92539, "75": 9.59673, "80": 9.55132, "85": 9.73428, "90": 9.9073, "95": 9.60983, "100": 9.5013}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 378379264.0, "5": 378379264.0, "10": 378379264.0, "15": 378379264.0, "20": 561073664.0, "25": 561860096.0, "30": 561073664.0, "35": 561860096.0, "40": 561860096.0, "45": 560811520.0, "50": 561073664.0, "55": 561073664.0, "60": 561073664.0, "65": 561860096.0, "70": 561860096.0, "75": 561073664.0, "80": 561860096.0, "85": 561335808.0, "90": 561073664.0, "95": 561073664.0, "100": 561860096.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.81109, "5": 0.21194, "10": 0.21151, "15": 0.21057, "20": 0.22167, "25": 0.2212, "30": 0.22059, "35": 0.22295, "40": 0.22292, "45": 0.22399, "50": 0.22321, "55": 0.21669, "60": 0.21726, "65": 0.21668, "70": 0.22074, "75": 0.21923, "80": 0.21775, "85": 0.21706, "90": 0.21701, "95": 0.21697, "100": 0.2163}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1799.0, "25": 2506.0, "30": 2471.0, "35": 2010.0, "40": 2153.0, "45": 2427.0, "50": 2914.0, "55": 2409.0, "60": 2939.0, "65": 2178.0, "70": 3539.0, "75": 3029.0, "80": 3531.0, "85": 3892.0, "90": 3772.0, "95": 4015.0, "100": 3520.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91616, "15": 10.93902, "20": 10.93405, "25": 10.88579, "30": 10.81295, "35": 10.72198, "40": 10.55137, "45": 10.32844, "50": 10.28765}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 378378752.0, "5": 378903040.0, "10": 378378752.0, "15": 378903040.0, "20": 560548864.0, "25": 560548864.0, "30": 560548864.0, "35": 559238144.0, "40": 560548864.0, "45": 560548864.0, "50": 560548864.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1905351680.0, "5": 1905352192.0, "10": 1905352192.0, "15": 1905352192.0, "20": 2087784448.0, "25": 2087784448.0, "30": 2087784448.0, "35": 2087784448.0, "40": 2087784448.0, "45": 2087784448.0, "50": 2087784448.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.36878, "5": 0.2008, "10": 0.19913, "15": 0.19916, "20": 0.21528, "25": 0.21446, "30": 0.2138, "35": 0.21509, "40": 0.2138, "45": 0.21394, "50": 0.21354}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1751.0, "25": 2490.0, "30": 2497.0, "35": 2017.0, "40": 2091.0, "45": 2389.0, "50": 2925.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.92717, "5": 10.92928, "10": 10.91616, "15": 10.93902, "20": 10.93405, "25": 10.88579, "30": 10.81295, "35": 10.72198, "40": 10.55137, "45": 10.32844, "50": 10.28766}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 378903040.0, "5": 378378752.0, "10": 378903040.0, "15": 378378752.0, "20": 560811008.0, "25": 560548864.0, "30": 561073152.0, "35": 562646016.0, "40": 560548864.0, "45": 562646016.0, "50": 560548864.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1905351680.0, "5": 1905352192.0, "10": 1905352192.0, "15": 1905352192.0, "20": 2087784448.0, "25": 2087784448.0, "30": 2087784448.0, "35": 2087784448.0, "40": 2087784448.0, "45": 2087784448.0, "50": 2087784448.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.5872, "5": 0.20393, "10": 0.20412, "15": 0.20193, "20": 0.22109, "25": 0.21826, "30": 0.21476, "35": 0.21348, "40": 0.21255, "45": 0.21142, "50": 0.21064}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1751.0, "25": 2491.0, "30": 2428.0, "35": 1827.0, "40": 2072.0, "45": 2361.0, "50": 2998.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G/model_config.yaml
@@ -23,8 +23,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -52,4 +52,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_dev.json
-{
-    "forward-backward-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            7.99255,
-            0.1699,
-            0.16797,
-            0.16814,
-            0.16792,
-            0.1675,
-            0.16973,
-            0.16925,
-            0.16932,
-            0.16655
-        ]
-    },
-    "forward-compute-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1.99201,
-            0.07269,
-            0.07105,
-            0.07144,
-            0.07113,
-            0.07113,
-            0.07269,
-            0.07292,
-            0.07231,
-            0.07028
-        ]
-    },
-    "backward-compute-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1.74189,
-            0.07561,
-            0.07559,
-            0.07617,
-            0.07601,
-            0.07555,
-            0.07573,
-            0.07602,
-            0.07589,
-            0.07554
-        ]
-    },
-    "batch-generator-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.33623,
-            0.00263,
-            0.00278,
-            0.00281,
-            0.0029,
-            0.00309,
-            0.00249,
-            0.00293,
-            0.00275,
-            0.00267
-        ]
-    },
-    "forward-recv-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            2.03589,
-            0.01468,
-            0.01445,
-            0.01439,
-            0.01441,
-            0.01438,
-            0.01445,
-            0.01443,
-            0.01439,
-            0.01458
-        ]
-    },
-    "forward-send-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.56239,
-            0.00016,
-            0.00014,
-            0.00015,
-            0.00015,
-            0.00015,
-            0.00017,
-            0.00015,
-            0.00015,
-            0.00014
-        ]
-    },
-    "backward-recv-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.01891,
-            0.01827,
-            0.01862,
-            0.01906,
-            0.01881,
-            0.01843,
-            0.01836,
-            0.01816,
-            0.01928,
-            0.01844
-        ]
-    },
-    "backward-send-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.00022,
-            0.00019,
-            0.00026,
-            0.00025,
-            0.00025,
-            0.00026,
-            0.00019,
-            0.00026,
-            0.00024,
-            0.00025
-        ]
-    },
-    "forward-send-backward-recv-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            3.65009,
-            0.02665,
-            0.02419,
-            0.02471,
-            0.02401,
-            0.02444,
-            0.02648,
-            0.02644,
-            0.02615,
-            0.02382
-        ]
-    },
-    "backward-send-forward-recv-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1.79597,
-            0.00095,
-            0.00098,
-            0.00098,
-            0.00099,
-            0.00104,
-            0.00099,
-            0.00107,
-            0.00111,
-            0.00095
-        ]
-    },
-    "layernorm-grads-all-reduce-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            3e-05,
-            2e-05,
-            3e-05,
-            2e-05,
-            2e-05,
-            2e-05,
-            2e-05,
-            2e-05,
-            2e-05,
-            2e-05
-        ]
-    },
-    "embedding-grads-all-reduce-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.00069,
-            0.00052,
-            0.00052,
-            0.00053,
-            0.00053,
-            0.00053,
-            0.00053,
-            0.00052,
-            0.00053,
-            0.00052
-        ]
-    },
-    "all-grads-sync-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.59902,
-            0.00084,
-            0.00085,
-            0.00083,
-            0.00084,
-            0.00083,
-            0.00084,
-            0.00087,
-            0.00084,
-            0.00084
-        ]
-    },
-    "optimizer-copy-to-main-grad-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.00026,
-            0.00019,
-            0.00019,
-            0.00019,
-            0.00019,
-            0.00019,
-            0.0002,
-            0.00019,
-            0.00019,
-            0.00019
-        ]
-    },
-    "optimizer-clip-main-grad-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.85985,
-            0.0011,
-            0.00109,
-            0.00115,
-            0.0012,
-            0.00108,
-            0.0011,
-            0.00108,
-            0.0011,
-            0.00109
-        ]
-    },
-    "optimizer-count-zeros-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.0167,
-            0.00528,
-            0.00524,
-            0.00528,
-            0.00523,
-            0.00525,
-            0.00524,
-            0.00525,
-            0.00525,
-            0.00527
-        ]
-    },
-    "optimizer-inner-step-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.01141,
-            0.00081,
-            0.00081,
-            0.00083,
-            0.00081,
-            0.00084,
-            0.00084,
-            0.00084,
-            0.00082,
-            0.00083
-        ]
-    },
-    "optimizer-copy-main-to-model-params-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.00088,
-            0.0006,
-            0.0006,
-            0.0006,
-            0.0006,
-            0.00082,
-            0.0006,
-            0.00059,
-            0.0006,
-            0.0006
-        ]
-    },
-    "optimizer-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.89007,
-            0.00859,
-            0.00853,
-            0.00862,
-            0.00862,
-            0.00885,
-            0.00857,
-            0.00857,
-            0.00854,
-            0.00858
-        ]
-    },
-    "learning-rate": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0
-        ]
-    },
-    "learning-rate vs samples": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0,
-            0.0
-        ]
-    },
-    "batch-size": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0
-        ]
-    },
-    "batch-size vs samples": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0,
-            32.0
-        ]
-    },
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.85926,
-            10.89117,
-            10.86647,
-            10.81416,
-            10.70027,
-            10.60761,
-            10.10644,
-            10.21377,
-            10.12972,
-            9.8041
-        ]
-    },
-    "lm loss vs samples": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.85926,
-            10.89117,
-            10.86647,
-            10.81416,
-            10.70027,
-            10.60761,
-            10.10644,
-            10.21377,
-            10.12972,
-            9.8041
-        ]
-    },
-    "loss-scale": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0
-        ]
-    },
-    "loss-scale vs samples": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0,
-            1.0
-        ]
-    },
-    "grad-norm": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            14.36883,
-            10.19308,
-            9.38217,
-            11.67025,
-            11.2611,
-            10.52068,
-            12.43181,
-            7.21395,
-            6.03602,
-            5.80161
-        ]
-    },
-    "grad-norm vs samples": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            14.36883,
-            10.19308,
-            9.38217,
-            11.67025,
-            11.2611,
-            10.52068,
-            12.43181,
-            7.21395,
-            6.03602,
-            5.80161
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1726.0,
-            1922.0,
-            2043.0,
-            1879.0,
-            1882.0,
-            1821.0,
-            1648.0,
-            2039.0,
-            2379.0,
-            2451.0
-        ]
-    },
-    "num-zeros vs samples": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1726.0,
-            1922.0,
-            2043.0,
-            1879.0,
-            1882.0,
-            1821.0,
-            1648.0,
-            2039.0,
-            2379.0,
-            2451.0
-        ]
-    },
-    "params-norm": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            180.01265,
-            180.01265,
-            180.01265,
-            180.01265,
-            180.01265,
-            180.01263,
-            180.0126,
-            180.01251,
-            180.01237,
-            180.01218
-        ]
-    },
-    "params-norm vs samples": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            180.01265,
-            180.01265,
-            180.01265,
-            180.01265,
-            180.01265,
-            180.01263,
-            180.0126,
-            180.01251,
-            180.01237,
-            180.01218
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            8.9047,
-            0.19058,
-            0.18857,
-            0.18884,
-            0.18868,
-            0.18839,
-            0.19045,
-            0.1901,
-            0.18993,
-            0.18735
-        ]
-    },
-    "lm loss validation": {
-        "start_step": 0,
-        "end_step": 2,
-        "step_interval": 5,
-        "values": [
-            9.81192
-        ]
-    },
-    "lm loss validation vs samples": {
-        "start_step": 0,
-        "end_step": 1,
-        "step_interval": 5,
-        "values": [
-            9.81192
-        ]
-    },
-    "lm loss validation ppl": {
-        "start_step": 0,
-        "end_step": 1,
-        "step_interval": 5,
-        "values": [
-            18250.01367
-        ]
-    },
-    "lm loss validation ppl vs samples": {
-        "start_step": 0,
-        "end_step": 1,
-        "step_interval": 5,
-        "values": [
-            18250.01367
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.85926, "5": 10.878, "10": 10.84086, "15": 10.81702, "20": 10.72418, "25": 10.55518, "30": 10.35548, "35": 10.2597, "40": 10.06425, "45": 9.81279, "50": 9.89265}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1726.0, "5": 1899.0, "10": 1437.0, "15": 1923.0, "20": 1700.0, "25": 1640.0, "30": 1993.0, "35": 2075.0, "40": 2268.0, "45": 2144.0, "50": 2461.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 487096832.0, "5": 487096832.0, "10": 487096832.0, "15": 487096832.0, "20": 487096832.0, "25": 487096832.0, "30": 487096832.0, "35": 487096832.0, "40": 487096832.0, "45": 487096832.0, "50": 487096832.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1229747712.0, "5": 1409821184.0, "10": 1409821184.0, "15": 1409821184.0, "20": 1409821184.0, "25": 1409821184.0, "30": 1409821184.0, "35": 1409821184.0, "40": 1409821184.0, "45": 1409821184.0, "50": 1409821184.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.04346, "5": 0.19365, "10": 0.19279, "15": 0.19212, "20": 0.1915, "25": 0.19182, "30": 0.192, "35": 0.19258, "40": 0.19179, "45": 0.19135, "50": 0.19151}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86208, 10.89137, 10.86731, 10.81652, 10.70126, 10.60816, 10.11007, 10.21889, 10.1294, 9.80326]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1659.0, 1944.0, 1974.0, 1920.0, 1918.0, 1855.0, 1621.0, 2018.0, 2436.0, 2304.0]}, "iteration_timing_avg": 0.14203264705882354}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86208, "5": 10.87869, "10": 10.84148, "15": 10.81526, "20": 10.72356, "25": 10.55942, "30": 10.35833, "35": 10.26014, "40": 10.06485, "45": 9.81413, "50": 9.89077}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1659.0, "5": 1904.0, "10": 1453.0, "15": 2011.0, "20": 1695.0, "25": 1617.0, "30": 1893.0, "35": 2080.0, "40": 2232.0, "45": 2224.0, "50": 2454.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 486047744.0, "5": 487096320.0, "10": 487096320.0, "15": 486047744.0, "20": 487096320.0, "25": 487096320.0, "30": 486047744.0, "35": 487096320.0, "40": 487096320.0, "45": 486047744.0, "50": 487096320.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1720084480.0, "5": 1900157952.0, "10": 1900157952.0, "15": 1900157952.0, "20": 1900157952.0, "25": 1900157952.0, "30": 1900157952.0, "35": 1900157952.0, "40": 1900157952.0, "45": 1900157952.0, "50": 1900157952.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.46191, "5": 0.19848, "10": 0.2013, "15": 0.20084, "20": 0.20142, "25": 0.20039, "30": 0.20371, "35": 0.20255, "40": 0.2022, "45": 0.20294, "50": 0.20066}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.85926, "5": 10.878, "10": 10.84086, "15": 10.81702, "20": 10.72418, "25": 10.55518, "30": 10.35548, "35": 10.2597, "40": 10.06425, "45": 9.81279, "50": 9.89265, "55": 9.86713, "60": 9.4818, "65": 8.93492, "70": 9.71847, "75": 9.41307, "80": 9.3968, "85": 9.60641, "90": 9.80599, "95": 9.51409, "100": 9.39833}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1726.0, "5": 1899.0, "10": 1437.0, "15": 1923.0, "20": 1700.0, "25": 1640.0, "30": 1993.0, "35": 2075.0, "40": 2268.0, "45": 2144.0, "50": 2461.0, "55": 2419.0, "60": 2540.0, "65": 2748.0, "70": 3339.0, "75": 2600.0, "80": 3404.0, "85": 3412.0, "90": 3049.0, "95": 3491.0, "100": 3350.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 488669696.0, "5": 488669696.0, "10": 488669696.0, "15": 488669696.0, "20": 488669696.0, "25": 488669696.0, "30": 488669696.0, "35": 488669696.0, "40": 488669696.0, "45": 488669696.0, "50": 488669696.0, "55": 488669696.0, "60": 488669696.0, "65": 488669696.0, "70": 488669696.0, "75": 488669696.0, "80": 488669696.0, "85": 488669696.0, "90": 488669696.0, "95": 488669696.0, "100": 488669696.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1229747712.0, "5": 1411918336.0, "10": 1411918336.0, "15": 1411918336.0, "20": 1411918336.0, "25": 1411918336.0, "30": 1411918336.0, "35": 1411918336.0, "40": 1411918336.0, "45": 1411918336.0, "50": 1411918336.0, "55": 1411918336.0, "60": 1411918336.0, "65": 1411918336.0, "70": 1411918336.0, "75": 1411918336.0, "80": 1411918336.0, "85": 1411918336.0, "90": 1411918336.0, "95": 1411918336.0, "100": 1411918336.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.2816, "5": 0.19252, "10": 0.19307, "15": 0.19178, "20": 0.19278, "25": 0.19268, "30": 0.19244, "35": 0.19333, "40": 0.19291, "45": 0.19374, "50": 0.19199, "55": 0.19307, "60": 0.19049, "65": 0.19061, "70": 0.19137, "75": 0.19057, "80": 0.1903, "85": 0.19047, "90": 0.19357, "95": 0.19059, "100": 0.1907}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86208, "5": 10.87869, "10": 10.84148, "15": 10.81526, "20": 10.72356, "25": 10.55942, "30": 10.35833, "35": 10.26014, "40": 10.06485, "45": 9.81413, "50": 9.89077, "55": 9.8674, "60": 9.48218, "65": 8.93482, "70": 9.7177, "75": 9.4111, "80": 9.39614, "85": 9.60606, "90": 9.80663, "95": 9.51629, "100": 9.39917}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1659.0, "5": 1904.0, "10": 1453.0, "15": 2011.0, "20": 1695.0, "25": 1617.0, "30": 1893.0, "35": 2080.0, "40": 2232.0, "45": 2224.0, "50": 2454.0, "55": 2461.0, "60": 2555.0, "65": 2883.0, "70": 3255.0, "75": 2586.0, "80": 3445.0, "85": 3442.0, "90": 3067.0, "95": 3500.0, "100": 3328.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 488144896.0, "5": 489193472.0, "10": 489193472.0, "15": 489193472.0, "20": 489193472.0, "25": 489193472.0, "30": 489193472.0, "35": 489193472.0, "40": 489193472.0, "45": 489193472.0, "50": 489193472.0, "55": 489193472.0, "60": 489193472.0, "65": 489193472.0, "70": 489193472.0, "75": 489193472.0, "80": 489193472.0, "85": 489193472.0, "90": 489193472.0, "95": 489193472.0, "100": 489193472.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1720084480.0, "5": 1902255104.0, "10": 1902255104.0, "15": 1902255104.0, "20": 1902255104.0, "25": 1902255104.0, "30": 1902255104.0, "35": 1902255104.0, "40": 1902255104.0, "45": 1902255104.0, "50": 1902255104.0, "55": 1902255104.0, "60": 1902255104.0, "65": 1902255104.0, "70": 1902255104.0, "75": 1902255104.0, "80": 1902255104.0, "85": 1902255104.0, "90": 1902255104.0, "95": 1902255104.0, "100": 1902255104.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.77466, "5": 0.19369, "10": 0.19406, "15": 0.19154, "20": 0.19362, "25": 0.19633, "30": 0.19002, "35": 0.19146, "40": 0.19099, "45": 0.19061, "50": 0.19124, "55": 0.19463, "60": 0.1903, "65": 0.19035, "70": 0.19049, "75": 0.18947, "80": 0.19086, "85": 0.1921, "90": 0.19047, "95": 0.1932, "100": 0.19029}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --use-legacy-models: true
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412}
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79311, "5": 10.83074, "10": 10.76725, "15": 10.82664, "20": 10.81793, "25": 10.76529, "30": 10.69182, "35": 10.61672, "40": 10.44907, "45": 10.21488, "50": 10.21715}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 586369024.0, "5": 587417600.0, "10": 587417600.0, "15": 587417600.0, "20": 869128704.0, "25": 867031552.0, "30": 867031552.0, "35": 867031552.0, "40": 867031552.0, "45": 867031552.0, "50": 869128704.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3832784384.0, "5": 3832784896.0, "10": 3832784896.0, "15": 3832784896.0, "20": 4114758144.0, "25": 4114758144.0, "30": 4114758144.0, "35": 4114758144.0, "40": 4114758144.0, "45": 4114758144.0, "50": 4114758144.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.89966, "5": 0.15568, "10": 0.15311, "15": 0.15336, "20": 0.15735, "25": 0.15804, "30": 0.15672, "35": 0.1548, "40": 0.15515, "45": 0.15584, "50": 0.15477}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1846.0, "25": 2348.0, "30": 2490.0, "35": 2010.0, "40": 2016.0, "45": 2642.0, "50": 2810.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79311, 10.85248, 10.87281, 10.83016, 10.82949, 10.78726, 10.565, 10.57088, 10.4836, 10.19521]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [2450.0, 2765.0, 2163.0, 2585.0, 2634.0, 2585.0, 2987.0]}, "iteration_timing_avg": 0.1211408823529412}
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79311, "5": 10.83074, "10": 10.76725, "15": 10.82664, "20": 10.81793, "25": 10.76529, "30": 10.69182, "35": 10.61672, "40": 10.44907, "45": 10.21488, "50": 10.21715}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 586369024.0, "5": 587417600.0, "10": 587417600.0, "15": 587417600.0, "20": 869128704.0, "25": 869128704.0, "30": 869128704.0, "35": 869128704.0, "40": 869128704.0, "45": 869128704.0, "50": 869128704.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3832784384.0, "5": 3832784896.0, "10": 3832784896.0, "15": 3832784896.0, "20": 4114758144.0, "25": 4114758144.0, "30": 4114758144.0, "35": 4114758144.0, "40": 4114758144.0, "45": 4114758144.0, "50": 4114758144.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 17.9574, "5": 0.15166, "10": 0.15201, "15": 0.1496, "20": 0.15614, "25": 0.15477, "30": 0.15483, "35": 0.15409, "40": 0.1546, "45": 0.15501, "50": 0.15639}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1846.0, "25": 2348.0, "30": 2490.0, "35": 2010.0, "40": 2016.0, "45": 2642.0, "50": 2810.0}}}
\ No newline at end of file