更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/golden_values_lts.json
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 10.81978,
+            "5": 10.85277,
+            "10": 10.79054,
+            "15": 10.81259,
+            "20": 10.71561,
+            "25": 10.52391,
+            "30": 10.33354,
+            "35": 10.22869,
+            "40": 10.04307,
+            "45": 9.77101,
+            "50": 9.86315,
+            "55": 9.82489,
+            "60": 9.45369,
+            "65": 8.89336,
+            "70": 9.69013,
+            "75": 9.38429,
+            "80": 9.37031,
+            "85": 9.58022,
+            "90": 9.78525,
+            "95": 9.49638,
+            "100": 9.36739
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 27138.0,
+            "5": 32036.0,
+            "10": 26255.0,
+            "15": 31309.0,
+            "20": 28869.0,
+            "25": 28605.0,
+            "30": 30817.0,
+            "35": 32882.0,
+            "40": 35373.0,
+            "45": 35484.0,
+            "50": 2136527.0,
+            "55": 2135084.0,
+            "60": 2137981.0,
+            "65": 2138995.0,
+            "70": 2142528.0,
+            "75": 2215276.0,
+            "80": 2144227.0,
+            "85": 2146040.0,
+            "90": 2146440.0,
+            "95": 2144187.0,
+            "100": 2144354.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 668320768.0,
+            "5": 668306944.0,
+            "10": 668313600.0,
+            "15": 668326912.0,
+            "20": 668314112.0,
+            "25": 668332544.0,
+            "30": 668326912.0,
+            "35": 668337664.0,
+            "40": 668306432.0,
+            "45": 668297728.0,
+            "50": 668282880.0,
+            "55": 668265984.0,
+            "60": 668249088.0,
+            "65": 668242944.0,
+            "70": 668224512.0,
+            "75": 668213248.0,
+            "80": 668222464.0,
+            "85": 668234752.0,
+            "90": 668237312.0,
+            "95": 668223488.0,
+            "100": 668209664.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 2355231744.0,
+            "5": 2605464064.0,
+            "10": 2605464064.0,
+            "15": 2605464064.0,
+            "20": 2605464064.0,
+            "25": 2615321600.0,
+            "30": 2615321600.0,
+            "35": 2618603520.0,
+            "40": 2618603520.0,
+            "45": 2618603520.0,
+            "50": 2618603520.0,
+            "55": 2618603520.0,
+            "60": 2618603520.0,
+            "65": 2618603520.0,
+            "70": 2618603520.0,
+            "75": 2618603520.0,
+            "80": 2618603520.0,
+            "85": 2618603520.0,
+            "90": 2618603520.0,
+            "95": 2618603520.0,
+            "100": 2618603520.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 5,
+        "values": {
+            "1": 6.84429,
+            "5": 0.49894,
+            "10": 0.4932,
+            "15": 0.48106,
+            "20": 0.48362,
+            "25": 0.48615,
+            "30": 0.49038,
+            "35": 0.49011,
+            "40": 0.50012,
+            "45": 0.49982,
+            "50": 0.49286,
+            "55": 0.92115,
+            "60": 0.49142,
+            "65": 0.49128,
+            "70": 0.49444,
+            "75": 0.49725,
+            "80": 0.4978,
+            "85": 0.49747,
+            "90": 0.497,
+            "95": 0.49687,
+            "100": 0.49788
+        }
+    }
+}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G/model_config.yaml
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Tree
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 100
+  --timing-log-level: 2
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 50
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 2
+  --disable-bias-linear: true
+  --sequence-parallel: true
+  --num-experts: 8
+  --moe-router-load-balancing-type: aux_loss
+  --moe-router-topk: 2
+  --moe-aux-loss-coeff: 1e-2
+  --use-custom-fsdp: true
+  --calculate-per-token-loss: true
+  --data-parallel-sharding-strategy: optim_grads_params
+  --use-distributed-optimizer: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --moe-grouped-gemm: true
+  --attention-softmax-in-fp32: true
+  --use-checkpoint-opt_param-scheduler: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
-{
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.86126,
-            10.88645,
-            10.87768,
-            10.83106,
-            10.71636,
-            10.60597,
-            10.13124,
-            10.22753,
-            10.1591,
-            9.83464
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1755.0,
-            2147.0,
-            2147.0,
-            2042.0,
-            2108.0,
-            1931.0,
-            1762.0,
-            2184.0,
-            2529.0,
-            2615.0
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            6.25178,
-            0.35642,
-            0.31793,
-            0.31783,
-            0.31708,
-            0.31607,
-            0.31789,
-            0.31477,
-            0.31433,
-            0.31727
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88248, "10": 10.83507, "15": 10.82743, "20": 10.72743, "25": 10.5575, "30": 10.37893, "35": 10.28325, "40": 10.08786, "45": 9.82625, "50": 9.91321}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1755.0, "5": 2185.0, "10": 1522.0, "15": 2063.0, "20": 1801.0, "25": 1775.0, "30": 2044.0, "35": 2294.0, "40": 2587.0, "45": 2425.0, "50": 2628.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269842432.0, "5": 269842432.0, "10": 269842432.0, "15": 269842432.0, "20": 269842432.0, "25": 269842432.0, "30": 269842432.0, "35": 269842432.0, "40": 269842432.0, "45": 269842432.0, "50": 269842432.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1449633280.0, "5": 1515627520.0, "10": 1515627520.0, "15": 1515627520.0, "20": 1515627520.0, "25": 1515627520.0, "30": 1515627520.0, "35": 1515627520.0, "40": 1515627520.0, "45": 1515627520.0, "50": 1515627520.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.40817, "5": 0.30338, "10": 0.30031, "15": 0.59063, "20": 0.30088, "25": 0.30345, "30": 0.30256, "35": 0.30195, "40": 0.3015, "45": 0.30212, "50": 0.30102}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.2256223529411765}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1450730496.0, "5": 1513579520.0, "10": 1513579520.0, "15": 1513579520.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.60475, "5": 0.30064, "10": 0.2968, "15": 0.29543, "20": 0.29652, "25": 0.29285, "30": 0.29834, "35": 0.2921, "40": 0.29827, "45": 0.29122, "50": 0.30005}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
-{
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.86126,
-            10.88645,
-            10.87768,
-            10.83106,
-            10.71636,
-            10.60597,
-            10.13124,
-            10.22753,
-            10.1591,
-            9.83464
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1755.0,
-            2147.0,
-            2147.0,
-            2042.0,
-            2108.0,
-            1931.0,
-            1762.0,
-            2184.0,
-            2529.0,
-            2615.0
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            7.0561,
-            0.32588,
-            0.32628,
-            0.32385,
-            0.32419,
-            0.32364,
-            0.32337,
-            0.32334,
-            0.32358,
-            0.32395
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88248, "10": 10.83507, "15": 10.82743, "20": 10.72743, "25": 10.5575, "30": 10.37893, "35": 10.28325, "40": 10.08786, "45": 9.82625, "50": 9.91321}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1755.0, "5": 2185.0, "10": 1522.0, "15": 2063.0, "20": 1801.0, "25": 1775.0, "30": 2044.0, "35": 2294.0, "40": 2587.0, "45": 2425.0, "50": 2628.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269842432.0, "5": 269842432.0, "10": 269842432.0, "15": 269842432.0, "20": 269842432.0, "25": 269842432.0, "30": 269842432.0, "35": 269842432.0, "40": 269842432.0, "45": 269842432.0, "50": 269842432.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1450682368.0, "5": 1515627520.0, "10": 1515627520.0, "15": 1515627520.0, "20": 1515627520.0, "25": 1515627520.0, "30": 1515627520.0, "35": 1515627520.0, "40": 1515627520.0, "45": 1515627520.0, "50": 1515627520.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.08264, "5": 0.50294, "10": 0.31361, "15": 0.31749, "20": 0.30552, "25": 0.31296, "30": 0.31703, "35": 0.30458, "40": 0.30685, "45": 0.31528, "50": 0.30493}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86126, 10.88643, 10.87768, 10.83108, 10.71635, 10.60599, 10.13124, 10.2275, 10.15914, 9.83465]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1752.0, 2067.0, 2123.0, 2072.0, 1999.0, 1941.0, 1784.0, 2229.0, 2546.0, 2567.0]}, "iteration_timing_avg": 0.22043823529411763}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1449682432.0, "5": 1515676672.0, "10": 1515676672.0, "15": 1515676672.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.33148, "5": 0.3095, "10": 0.30881, "15": 0.30285, "20": 0.305, "25": 0.30028, "30": 0.30512, "35": 0.30125, "40": 0.30469, "45": 0.29938, "50": 0.30327}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,4 +50,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json
-{
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.86217,
-            10.88646,
-            10.87861,
-            10.83295,
-            10.7203,
-            10.61089,
-            10.14181,
-            10.23434,
-            10.16609,
-            9.84444
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1769.0,
-            2056.0,
-            2198.0,
-            2079.0,
-            2181.0,
-            1912.0,
-            1825.0,
-            2115.0,
-            2621.0,
-            2598.0
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            6.42448,
-            0.42854,
-            0.42836,
-            0.42582,
-            0.42274,
-            0.42187,
-            0.42561,
-            0.42178,
-            0.44234,
-            0.42304
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86217, "5": 10.88249, "10": 10.83646, "15": 10.82906, "20": 10.73236, "25": 10.56397, "30": 10.38482, "35": 10.28955, "40": 10.09137, "45": 9.83491, "50": 9.91602}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1722.0, "5": 2259.0, "10": 1596.0, "15": 2099.0, "20": 1919.0, "25": 1785.0, "30": 2048.0, "35": 2290.0, "40": 2558.0, "45": 2447.0, "50": 2676.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 368387584.0, "5": 368387584.0, "10": 368387584.0, "15": 368387584.0, "20": 368387584.0, "25": 368387584.0, "30": 368387584.0, "35": 368387584.0, "40": 368387584.0, "45": 368387584.0, "50": 368387584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1522507264.0, "5": 1653494272.0, "10": 1653494272.0, "15": 1653494272.0, "20": 1653494272.0, "25": 1653494272.0, "30": 1653494272.0, "35": 1653494272.0, "40": 1653494272.0, "45": 1653494272.0, "50": 1653494272.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.95291, "5": 0.41956, "10": 0.41644, "15": 0.41504, "20": 0.41541, "25": 0.41645, "30": 0.41452, "35": 0.41456, "40": 0.41402, "45": 0.41451, "50": 0.41368}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86217, 10.88641, 10.8786, 10.83291, 10.72031, 10.6109, 10.1418, 10.23434, 10.16605, 9.84445]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1769.0, 2019.0, 2145.0, 2058.0, 2166.0, 2060.0, 1776.0, 2174.0, 2524.0, 2645.0]}, "iteration_timing_avg": 0.2256223529411765}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86217, "5": 10.8825, "10": 10.83647, "15": 10.8291, "20": 10.73231, "25": 10.56391, "30": 10.38476, "35": 10.28957, "40": 10.09136, "45": 9.83492, "50": 9.91604}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1769.0, "5": 2216.0, "10": 1573.0, "15": 2132.0, "20": 1815.0, "25": 1849.0, "30": 2009.0, "35": 2182.0, "40": 2489.0, "45": 2381.0, "50": 2727.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 368387072.0, "5": 368387072.0, "10": 368387072.0, "15": 368387072.0, "20": 368387072.0, "25": 368387072.0, "30": 368387072.0, "35": 368387072.0, "40": 368387072.0, "45": 368387072.0, "50": 368387072.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1512020480.0, "5": 1647202304.0, "10": 1647202304.0, "15": 1647202304.0, "20": 1647202304.0, "25": 1647202304.0, "30": 1647202304.0, "35": 1647202304.0, "40": 1647202304.0, "45": 1647202304.0, "50": 1647202304.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.51772, "5": 0.40946, "10": 0.40614, "15": 0.40997, "20": 0.41426, "25": 0.41365, "30": 0.41686, "35": 0.40937, "40": 0.41695, "45": 0.4147, "50": 0.42032}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86065, "5": 10.88253, "10": 10.8352, "15": 10.82848, "20": 10.72795, "25": 10.55737, "30": 10.37935, "35": 10.28345, "40": 10.0878, "45": 9.82662, "50": 9.91321, "55": 9.87799, "60": 9.50877, "65": 8.95112, "70": 9.73131, "75": 9.43668, "80": 9.41164, "85": 9.61594, "90": 9.8216, "95": 9.51907, "100": 9.40583}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1716.0, "5": 2123.0, "10": 1539.0, "15": 2026.0, "20": 1842.0, "25": 1767.0, "30": 2099.0, "35": 2213.0, "40": 2387.0, "45": 2378.0, "50": 2771.0, "55": 2649.0, "60": 2734.0, "65": 2982.0, "70": 3716.0, "75": 2729.0, "80": 3758.0, "85": 3562.0, "90": 3313.0, "95": 3458.0, "100": 3432.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269842944.0, "5": 269842944.0, "10": 269842944.0, "15": 269842944.0, "20": 269842944.0, "25": 269842944.0, "30": 269842944.0, "35": 269842944.0, "40": 269842944.0, "45": 269842944.0, "50": 269842944.0, "55": 269842944.0, "60": 269842944.0, "65": 269842944.0, "70": 269842944.0, "75": 269842944.0, "80": 269842944.0, "85": 269842944.0, "90": 269842944.0, "95": 269842944.0, "100": 269842944.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 956153344.0, "5": 1035779584.0, "10": 1035779584.0, "15": 1035779584.0, "20": 1035779584.0, "25": 1035779584.0, "30": 1035779584.0, "35": 1035779584.0, "40": 1035779584.0, "45": 1035779584.0, "50": 1035779584.0, "55": 1035779584.0, "60": 1035779584.0, "65": 1035779584.0, "70": 1035779584.0, "75": 1035779584.0, "80": 1035779584.0, "85": 1035779584.0, "90": 1035779584.0, "95": 1035779584.0, "100": 1035779584.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.03589, "5": 0.28552, "10": 0.28504, "15": 0.29587, "20": 0.28309, "25": 0.27926, "30": 0.27852, "35": 0.27751, "40": 0.27651, "45": 0.27785, "50": 0.27743, "55": 0.27487, "60": 0.27351, "65": 0.27319, "70": 0.27565, "75": 0.50898, "80": 0.27289, "85": 0.27348, "90": 0.27316, "95": 0.27294, "100": 0.27277}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326, "55": 9.8779, "60": 9.50869, "65": 8.95102, "70": 9.73166, "75": 9.43677, "80": 9.41158, "85": 9.61615, "90": 9.82168, "95": 9.51915, "100": 9.40594}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0, "55": 2676.0, "60": 2773.0, "65": 3019.0, "70": 3591.0, "75": 2870.0, "80": 3765.0, "85": 3549.0, "90": 3490.0, "95": 3544.0, "100": 3617.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0, "55": 269891584.0, "60": 269891584.0, "65": 269891584.0, "70": 269891584.0, "75": 269891584.0, "80": 269891584.0, "85": 269891584.0, "90": 269891584.0, "95": 269891584.0, "100": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1450730496.0, "5": 1515676672.0, "10": 1515676672.0, "15": 1515676672.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0, "55": 1515676672.0, "60": 1515676672.0, "65": 1515676672.0, "70": 1515676672.0, "75": 1515676672.0, "80": 1515676672.0, "85": 1515676672.0, "90": 1515676672.0, "95": 1515676672.0, "100": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.22865, "5": 0.31237, "10": 0.31889, "15": 0.30979, "20": 0.31592, "25": 0.31368, "30": 0.31292, "35": 0.31001, "40": 0.31087, "45": 0.30787, "50": 0.3067, "55": 0.30223, "60": 0.29974, "65": 0.29841, "70": 0.29787, "75": 0.30072, "80": 0.29729, "85": 0.29753, "90": 0.29692, "95": 0.29937, "100": 0.29618}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86065, "5": 10.88253, "10": 10.8352, "15": 10.82848, "20": 10.72795, "25": 10.55737, "30": 10.37935, "35": 10.28345, "40": 10.0878, "45": 9.82662, "50": 9.91321, "55": 9.87799, "60": 9.50877, "65": 8.95112, "70": 9.73131, "75": 9.43668, "80": 9.41164, "85": 9.61594, "90": 9.8216, "95": 9.51907, "100": 9.40583}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1716.0, "5": 2123.0, "10": 1539.0, "15": 2026.0, "20": 1842.0, "25": 1767.0, "30": 2099.0, "35": 2213.0, "40": 2387.0, "45": 2378.0, "50": 2771.0, "55": 2649.0, "60": 2734.0, "65": 2982.0, "70": 3716.0, "75": 2729.0, "80": 3758.0, "85": 3562.0, "90": 3313.0, "95": 3458.0, "100": 3432.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269842944.0, "5": 269842944.0, "10": 269842944.0, "15": 269842944.0, "20": 269842944.0, "25": 269842944.0, "30": 269842944.0, "35": 269842944.0, "40": 269842944.0, "45": 269842944.0, "50": 269842944.0, "55": 269842944.0, "60": 269842944.0, "65": 269842944.0, "70": 269842944.0, "75": 269842944.0, "80": 269842944.0, "85": 269842944.0, "90": 269842944.0, "95": 269842944.0, "100": 269842944.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 964541952.0, "5": 1035779584.0, "10": 1035779584.0, "15": 1035779584.0, "20": 1035779584.0, "25": 1035779584.0, "30": 1035779584.0, "35": 1035779584.0, "40": 1035779584.0, "45": 1035779584.0, "50": 1035779584.0, "55": 1035779584.0, "60": 1035779584.0, "65": 1035779584.0, "70": 1035779584.0, "75": 1035779584.0, "80": 1035779584.0, "85": 1035779584.0, "90": 1035779584.0, "95": 1035779584.0, "100": 1035779584.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 6.5187, "5": 0.28962, "10": 0.29031, "15": 0.28267, "20": 0.28213, "25": 0.28217, "30": 0.28177, "35": 0.28205, "40": 0.28416, "45": 0.28076, "50": 0.28157, "55": 0.28142, "60": 0.28019, "65": 0.28032, "70": 0.28088, "75": 0.27982, "80": 0.27983, "85": 0.27971, "90": 0.28117, "95": 0.28008, "100": 0.27953}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86126, "5": 10.88253, "10": 10.83509, "15": 10.82739, "20": 10.72744, "25": 10.55749, "30": 10.37894, "35": 10.28322, "40": 10.08784, "45": 9.82625, "50": 9.91326, "55": 9.8779, "60": 9.50869, "65": 8.95102, "70": 9.73166, "75": 9.43677, "80": 9.41158, "85": 9.61615, "90": 9.82168, "95": 9.51915, "100": 9.40594}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1752.0, "5": 2204.0, "10": 1567.0, "15": 2068.0, "20": 1783.0, "25": 1795.0, "30": 2124.0, "35": 2186.0, "40": 2575.0, "45": 2278.0, "50": 2683.0, "55": 2676.0, "60": 2773.0, "65": 3019.0, "70": 3591.0, "75": 2870.0, "80": 3765.0, "85": 3549.0, "90": 3490.0, "95": 3544.0, "100": 3617.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 269891584.0, "5": 269891584.0, "10": 269891584.0, "15": 269891584.0, "20": 269891584.0, "25": 269891584.0, "30": 269891584.0, "35": 269891584.0, "40": 269891584.0, "45": 269891584.0, "50": 269891584.0, "55": 269891584.0, "60": 269891584.0, "65": 269891584.0, "70": 269891584.0, "75": 269891584.0, "80": 269891584.0, "85": 269891584.0, "90": 269891584.0, "95": 269891584.0, "100": 269891584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1450731008.0, "5": 1515675648.0, "10": 1515676672.0, "15": 1515676672.0, "20": 1515676672.0, "25": 1515676672.0, "30": 1515676672.0, "35": 1515676672.0, "40": 1515676672.0, "45": 1515676672.0, "50": 1515676672.0, "55": 1515676672.0, "60": 1515676672.0, "65": 1515676672.0, "70": 1515676672.0, "75": 1515676672.0, "80": 1515676672.0, "85": 1515676672.0, "90": 1515676672.0, "95": 1515676672.0, "100": 1515676672.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.38789, "5": 0.322, "10": 0.31419, "15": 0.31533, "20": 0.30974, "25": 0.30867, "30": 0.30191, "35": 0.30301, "40": 0.30266, "45": 0.30177, "50": 0.30441, "55": 0.33472, "60": 0.31376, "65": 0.32009, "70": 0.31308, "75": 0.31965, "80": 0.31251, "85": 0.31098, "90": 0.30726, "95": 0.30595, "100": 0.30772}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,4 +50,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86073, "5": 10.8823, "10": 10.83564, "15": 10.83051, "20": 10.73302, "25": 10.56317, "30": 10.38508, "35": 10.28979, "40": 10.09131, "45": 9.83512, "50": 9.91593, "55": 9.88231, "60": 9.51403, "65": 8.95406, "70": 9.7307, "75": 9.43134, "80": 9.40601, "85": 9.61116, "90": 9.8175, "95": 9.51556, "100": 9.40417}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1793.0, "5": 2158.0, "10": 1547.0, "15": 2089.0, "20": 1858.0, "25": 1753.0, "30": 2091.0, "35": 2200.0, "40": 2602.0, "45": 2415.0, "50": 2741.0, "55": 2688.0, "60": 2698.0, "65": 2813.0, "70": 3731.0, "75": 2787.0, "80": 3822.0, "85": 3525.0, "90": 3430.0, "95": 3582.0, "100": 3723.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 368387584.0, "5": 368387584.0, "10": 368387584.0, "15": 368387584.0, "20": 368387584.0, "25": 368387584.0, "30": 368387584.0, "35": 368387584.0, "40": 368387584.0, "45": 368387584.0, "50": 368387584.0, "55": 368387584.0, "60": 368387584.0, "65": 368387584.0, "70": 368387584.0, "75": 368387584.0, "80": 368387584.0, "85": 368387584.0, "90": 368387584.0, "95": 368387584.0, "100": 368387584.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1032173056.0, "5": 1163160064.0, "10": 1163160064.0, "15": 1163160064.0, "20": 1163160064.0, "25": 1163160064.0, "30": 1163160064.0, "35": 1163160064.0, "40": 1163160064.0, "45": 1163160064.0, "50": 1163160064.0, "55": 1163160064.0, "60": 1163160064.0, "65": 1163160064.0, "70": 1163160064.0, "75": 1163160064.0, "80": 1163160064.0, "85": 1163160064.0, "90": 1163160064.0, "95": 1163160064.0, "100": 1163160064.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.70935, "5": 0.38509, "10": 0.39398, "15": 0.39049, "20": 0.39065, "25": 0.38942, "30": 0.38888, "35": 0.39041, "40": 0.39256, "45": 0.39188, "50": 0.39096, "55": 0.38207, "60": 0.38257, "65": 0.38138, "70": 0.3975, "75": 0.38155, "80": 0.38011, "85": 0.38775, "90": 0.38412, "95": 0.3829, "100": 0.38287}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86217, "5": 10.8825, "10": 10.83647, "15": 10.8291, "20": 10.73231, "25": 10.56391, "30": 10.38476, "35": 10.28957, "40": 10.09136, "45": 9.83492, "50": 9.91604, "55": 9.88229, "60": 9.51379, "65": 8.95396, "70": 9.731, "75": 9.43126, "80": 9.40596, "85": 9.61136, "90": 9.81744, "95": 9.51567, "100": 9.4043}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1769.0, "5": 2216.0, "10": 1573.0, "15": 2132.0, "20": 1815.0, "25": 1849.0, "30": 2009.0, "35": 2182.0, "40": 2489.0, "45": 2381.0, "50": 2727.0, "55": 2667.0, "60": 2723.0, "65": 2907.0, "70": 3734.0, "75": 2746.0, "80": 3726.0, "85": 3599.0, "90": 3323.0, "95": 3615.0, "100": 3524.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 368387072.0, "5": 368387072.0, "10": 368387072.0, "15": 368387072.0, "20": 368387072.0, "25": 368387072.0, "30": 368387072.0, "35": 368387072.0, "40": 368387072.0, "45": 368387072.0, "50": 368387072.0, "55": 368387072.0, "60": 368387072.0, "65": 368387072.0, "70": 368387072.0, "75": 368387072.0, "80": 368387072.0, "85": 368387072.0, "90": 368387072.0, "95": 368387072.0, "100": 368387072.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1513069568.0, "5": 1647199744.0, "10": 1647199744.0, "15": 1647201792.0, "20": 1647201792.0, "25": 1647201792.0, "30": 1647201792.0, "35": 1647201792.0, "40": 1647201792.0, "45": 1647201792.0, "50": 1647201792.0, "55": 1647201792.0, "60": 1649298944.0, "65": 1649298944.0, "70": 1649298944.0, "75": 1649298944.0, "80": 1649298944.0, "85": 1649298944.0, "90": 1649298944.0, "95": 1649298944.0, "100": 1649298944.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.88726, "5": 0.40682, "10": 0.43529, "15": 0.47149, "20": 0.41093, "25": 0.40566, "30": 0.42086, "35": 0.40692, "40": 0.4028, "45": 0.40374, "50": 0.404, "55": 0.41679, "60": 0.42436, "65": 0.427, "70": 0.42395, "75": 0.4485, "80": 0.45249, "85": 0.41989, "90": 0.41911, "95": 0.42649, "100": 0.42528}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume