更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
-{
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.84474,
-            10.87688,
-            10.90253,
-            10.81872,
-            10.67849,
-            10.60076,
-            10.06361,
-            10.19267,
-            10.11344,
-            9.75987
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1769.0,
-            2129.0,
-            1987.0,
-            1961.0,
-            1961.0,
-            1886.0,
-            1655.0,
-            2130.0,
-            2315.0,
-            2362.0
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            8.72642,
-            0.16194,
-            0.15926,
-            0.15956,
-            0.15972,
-            0.1623,
-            0.16029,
-            0.15863,
-            0.15947,
-            0.15935
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84474, "5": 10.8642, "10": 10.82152, "15": 10.81201, "20": 10.71869, "25": 10.53034, "30": 10.33576, "35": 10.24082, "40": 10.05009, "45": 9.76761, "50": 9.85505}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1769.0, "5": 2061.0, "10": 1636.0, "15": 2011.0, "20": 1779.0, "25": 1875.0, "30": 2074.0, "35": 2069.0, "40": 2190.0, "45": 2153.0, "50": 2508.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3215778304.0, "5": 3575244288.0, "10": 3575244288.0, "15": 3575244288.0, "20": 3575244288.0, "25": 3575244288.0, "30": 3575244288.0, "35": 3575244288.0, "40": 3575244288.0, "45": 3575244288.0, "50": 3575244288.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.69754, "5": 0.16083, "10": 0.16079, "15": 0.16126, "20": 0.16129, "25": 0.16055, "30": 0.1609, "35": 0.16119, "40": 0.16222, "45": 0.16081, "50": 0.15983}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.84474, 10.87687, 10.90254, 10.81872, 10.67848, 10.60075, 10.06363, 10.19268, 10.11342, 9.75986]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1776.0, 2161.0, 2052.0, 1892.0, 1971.0, 1946.0, 1701.0, 1985.0, 2295.0, 2293.0]}, "iteration_timing_avg": 0.11052176470588236}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.84474, "5": 10.86418, "10": 10.82155, "15": 10.81195, "20": 10.71872, "25": 10.53036, "30": 10.3358, "35": 10.24082, "40": 10.05008, "45": 9.76762, "50": 9.85505}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1776.0, "5": 2128.0, "10": 1615.0, "15": 2021.0, "20": 1775.0, "25": 1916.0, "30": 2029.0, "35": 2107.0, "40": 2174.0, "45": 2110.0, "50": 2363.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3215778304.0, "5": 3575244288.0, "10": 3575244288.0, "15": 3575244288.0, "20": 3575244288.0, "25": 3575244288.0, "30": 3575244288.0, "35": 3575244288.0, "40": 3575244288.0, "45": 3575244288.0, "50": 3575244288.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.1728, "5": 0.15783, "10": 0.15696, "15": 0.15564, "20": 0.15887, "25": 0.15731, "30": 0.15635, "35": 0.1571, "40": 0.15637, "45": 0.15705, "50": 0.15413}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,4 +50,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
-{
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.79205,
-            10.86789,
-            10.89149,
-            10.78328,
-            10.66126,
-            10.58275,
-            10.08467,
-            10.19448,
-            10.13785,
-            9.81454
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1580.0,
-            1778.0,
-            1849.0,
-            1841.0,
-            1884.0,
-            1679.0,
-            1544.0,
-            1953.0,
-            2449.0,
-            2335.0
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.79458,
-            0.16744,
-            0.16286,
-            0.16276,
-            0.16292,
-            0.16346,
-            0.16288,
-            0.16273,
-            0.16282,
-            0.16245
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79205, "5": 10.84695, "10": 10.77106, "15": 10.79093, "20": 10.68042, "25": 10.50715, "30": 10.33325, "35": 10.25545, "40": 10.05544, "45": 9.80575, "50": 9.89082}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1580.0, "5": 1901.0, "10": 1346.0, "15": 1926.0, "20": 1643.0, "25": 1683.0, "30": 1867.0, "35": 2020.0, "40": 2252.0, "45": 2243.0, "50": 2459.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 716833792.0, "5": 716833792.0, "10": 716833792.0, "15": 716833792.0, "20": 716833792.0, "25": 716833792.0, "30": 716833792.0, "35": 716833792.0, "40": 716833792.0, "45": 716833792.0, "50": 716833792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2399714304.0, "5": 2683412480.0, "10": 2683412480.0, "15": 2683412480.0, "20": 2683412480.0, "25": 2683412480.0, "30": 2683412480.0, "35": 2683412480.0, "40": 2683412480.0, "45": 2683412480.0, "50": 2683412480.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.70564, "5": 0.16109, "10": 0.15745, "15": 0.15861, "20": 0.15886, "25": 0.15817, "30": 0.15999, "35": 0.16113, "40": 0.15887, "45": 0.16006, "50": 0.1597}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79205, 10.86789, 10.89149, 10.78328, 10.66126, 10.58275, 10.08467, 10.19448, 10.13785, 9.81454]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1580.0, 1778.0, 1849.0, 1841.0, 1884.0, 1679.0, 1544.0, 1953.0, 2449.0, 2335.0]}, "iteration_timing_avg": 0.12243558823529416}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79205, "5": 10.84695, "10": 10.77106, "15": 10.79093, "20": 10.68042, "25": 10.50715, "30": 10.33325, "35": 10.25545, "40": 10.05544, "45": 9.80575, "50": 9.89082}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1580.0, "5": 1901.0, "10": 1346.0, "15": 1926.0, "20": 1643.0, "25": 1683.0, "30": 1867.0, "35": 2020.0, "40": 2252.0, "45": 2243.0, "50": 2459.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 714736640.0, "5": 714736640.0, "10": 714736640.0, "15": 714736640.0, "20": 714736640.0, "25": 714736640.0, "30": 714736640.0, "35": 714736640.0, "40": 714736640.0, "45": 714736640.0, "50": 714736640.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2399714304.0, "5": 2681315328.0, "10": 2681315328.0, "15": 2681315328.0, "20": 2681315328.0, "25": 2681315328.0, "30": 2681315328.0, "35": 2681315328.0, "40": 2681315328.0, "45": 2681315328.0, "50": 2681315328.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.48918, "5": 0.16426, "10": 0.16419, "15": 0.15777, "20": 0.15716, "25": 0.15773, "30": 0.15842, "35": 0.15959, "40": 0.15581, "45": 0.15603, "50": 0.15595}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79205, "5": 10.84695, "10": 10.77106, "15": 10.79093, "20": 10.68042, "25": 10.50715, "30": 10.33325, "35": 10.25545, "40": 10.05544, "45": 9.80575, "50": 9.89082}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1580.0, "5": 1901.0, "10": 1346.0, "15": 1926.0, "20": 1643.0, "25": 1683.0, "30": 1867.0, "35": 2020.0, "40": 2252.0, "45": 2243.0, "50": 2459.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 716833792.0, "5": 716833792.0, "10": 716833792.0, "15": 716833792.0, "20": 716833792.0, "25": 716833792.0, "30": 716833792.0, "35": 716833792.0, "40": 716833792.0, "45": 716833792.0, "50": 716833792.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2399714304.0, "5": 2683412480.0, "10": 2683412480.0, "15": 2683412480.0, "20": 2683412480.0, "25": 2683412480.0, "30": 2683412480.0, "35": 2683412480.0, "40": 2683412480.0, "45": 2683412480.0, "50": 2683412480.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.70564, "5": 0.16109, "10": 0.15745, "15": 0.15861, "20": 0.15886, "25": 0.15817, "30": 0.15999, "35": 0.16113, "40": 0.15887, "45": 0.16006, "50": 0.1597}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.79205, "5": 10.84695, "10": 10.77106, "15": 10.79093, "20": 10.68042, "25": 10.50715, "30": 10.33325, "35": 10.25545, "40": 10.05544, "45": 9.80575, "50": 9.89082}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1580.0, "5": 1901.0, "10": 1346.0, "15": 1926.0, "20": 1643.0, "25": 1683.0, "30": 1867.0, "35": 2020.0, "40": 2252.0, "45": 2243.0, "50": 2459.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 714736640.0, "5": 714736640.0, "10": 714736640.0, "15": 714736640.0, "20": 714736640.0, "25": 714736640.0, "30": 714736640.0, "35": 714736640.0, "40": 714736640.0, "45": 714736640.0, "50": 714736640.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2399714304.0, "5": 2681315328.0, "10": 2681315328.0, "15": 2681315328.0, "20": 2681315328.0, "25": 2681315328.0, "30": 2681315328.0, "35": 2681315328.0, "40": 2681315328.0, "45": 2681315328.0, "50": 2681315328.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.48837, "5": 0.15716, "10": 0.1577, "15": 0.1575, "20": 0.15694, "25": 0.15689, "30": 0.16393, "35": 0.15702, "40": 0.15586, "45": 0.1552, "50": 0.15598}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Tree
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 50
+  --timing-log-level: 2
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 10000
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 1
+  --pipeline-model-parallel-size: 4
+  --disable-bias-linear: true
+  --async-save: true
+  --use-persistent-ckpt-worker: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
+TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79196, "5": 10.84767, "10": 10.76997, "15": 10.79032, "20": 10.68032, "25": 10.5078, "30": 10.3335, "35": 10.25557, "40": 10.05566, "45": 9.80602, "50": 9.89125, "55": 9.87089, "60": 9.4846, "65": 8.94044, "70": 9.7223, "75": 9.40865, "80": 9.39753, "85": 9.60719, "90": 9.81041, "95": 9.51159, "100": 9.39705}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1605.0, "5": 1978.0, "10": 1442.0, "15": 1952.0, "20": 1667.0, "25": 1734.0, "30": 1952.0, "35": 2043.0, "40": 2231.0, "45": 2197.0, "50": 2405.0, "55": 2212.0, "60": 2367.0, "65": 2639.0, "70": 3196.0, "75": 2592.0, "80": 3222.0, "85": 3406.0, "90": 3002.0, "95": 3368.0, "100": 3152.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 716834304.0, "5": 716834304.0, "10": 716834304.0, "15": 716834304.0, "20": 716834304.0, "25": 716834304.0, "30": 716834304.0, "35": 716834304.0, "40": 716834304.0, "45": 716834304.0, "50": 716834304.0, "55": 716834304.0, "60": 716834304.0, "65": 716834304.0, "70": 716834304.0, "75": 716834304.0, "80": 716834304.0, "85": 716834304.0, "90": 716834304.0, "95": 716834304.0, "100": 716834304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1910424576.0, "5": 2193074176.0, "10": 2193074176.0, "15": 2193074176.0, "20": 2193074176.0, "25": 2193074176.0, "30": 2193074176.0, "35": 2193074176.0, "40": 2193074176.0, "45": 2193074176.0, "50": 2193074176.0, "55": 2193074176.0, "60": 2193074176.0, "65": 2193074176.0, "70": 2193074176.0, "75": 2193074176.0, "80": 2193074176.0, "85": 2193074176.0, "90": 2193074176.0, "95": 2193074176.0, "100": 2193074176.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.46967, "5": 0.1483, "10": 0.14544, "15": 0.14588, "20": 0.14639, "25": 0.14549, "30": 0.14597, "35": 0.14608, "40": 0.14578, "45": 0.14542, "50": 0.14492, "55": 0.14474, "60": 0.14635, "65": 0.14621, "70": 0.14453, "75": 0.14374, "80": 0.14465, "85": 0.14456, "90": 0.14413, "95": 0.14445, "100": 0.14399}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79205, "5": 10.84695, "10": 10.77106, "15": 10.79093, "20": 10.68042, "25": 10.50715, "30": 10.33325, "35": 10.25545, "40": 10.05544, "45": 9.80575, "50": 9.89082, "55": 9.87063, "60": 9.48478, "65": 8.94022, "70": 9.72243, "75": 9.40907, "80": 9.3976, "85": 9.60746, "90": 9.81041, "95": 9.5116, "100": 9.39722}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1580.0, "5": 1901.0, "10": 1346.0, "15": 1926.0, "20": 1643.0, "25": 1683.0, "30": 1867.0, "35": 2020.0, "40": 2252.0, "45": 2243.0, "50": 2459.0, "55": 2291.0, "60": 2404.0, "65": 2474.0, "70": 3102.0, "75": 2603.0, "80": 3420.0, "85": 3388.0, "90": 2904.0, "95": 3333.0, "100": 3347.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 714736640.0, "5": 714736640.0, "10": 714736640.0, "15": 714736640.0, "20": 714736640.0, "25": 714736640.0, "30": 714736640.0, "35": 714736640.0, "40": 714736640.0, "45": 714736640.0, "50": 714736640.0, "55": 714736640.0, "60": 714736640.0, "65": 714736640.0, "70": 714736640.0, "75": 714736640.0, "80": 714736640.0, "85": 714736640.0, "90": 714736640.0, "95": 714736640.0, "100": 714736640.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2399714304.0, "5": 2681315328.0, "10": 2681315328.0, "15": 2681315328.0, "20": 2681315328.0, "25": 2681315328.0, "30": 2681315328.0, "35": 2681315328.0, "40": 2681315328.0, "45": 2681315328.0, "50": 2681315328.0, "55": 2681315328.0, "60": 2681315328.0, "65": 2681315328.0, "70": 2681315328.0, "75": 2681315328.0, "80": 2681315328.0, "85": 2681315328.0, "90": 2681315328.0, "95": 2681315328.0, "100": 2681315328.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.27792, "5": 0.15646, "10": 0.15784, "15": 0.15721, "20": 0.15673, "25": 0.15668, "30": 0.15634, "35": 0.1575, "40": 0.1572, "45": 0.15552, "50": 0.15469, "55": 0.16595, "60": 0.16703, "65": 0.16692, "70": 0.15969, "75": 0.15799, "80": 0.15892, "85": 0.15874, "90": 0.159, "95": 0.16041, "100": 0.15753}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79196, "5": 10.84767, "10": 10.76997, "15": 10.79032, "20": 10.68032, "25": 10.5078, "30": 10.3335, "35": 10.25557, "40": 10.05566, "45": 9.80602, "50": 9.89125, "55": 9.87089, "60": 9.4846, "65": 8.94044, "70": 9.7223, "75": 9.40865, "80": 9.39753, "85": 9.60719, "90": 9.81041, "95": 9.51159, "100": 9.39705}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1605.0, "5": 1978.0, "10": 1442.0, "15": 1952.0, "20": 1667.0, "25": 1734.0, "30": 1952.0, "35": 2043.0, "40": 2231.0, "45": 2197.0, "50": 2405.0, "55": 2212.0, "60": 2367.0, "65": 2639.0, "70": 3196.0, "75": 2592.0, "80": 3222.0, "85": 3406.0, "90": 3002.0, "95": 3368.0, "100": 3152.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 716834304.0, "5": 716834304.0, "10": 716834304.0, "15": 716834304.0, "20": 716834304.0, "25": 716834304.0, "30": 716834304.0, "35": 716834304.0, "40": 716834304.0, "45": 716834304.0, "50": 716834304.0, "55": 716834304.0, "60": 716834304.0, "65": 716834304.0, "70": 716834304.0, "75": 716834304.0, "80": 716834304.0, "85": 716834304.0, "90": 716834304.0, "95": 716834304.0, "100": 716834304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1910424576.0, "5": 2193074176.0, "10": 2193074176.0, "15": 2193074176.0, "20": 2193074176.0, "25": 2193074176.0, "30": 2193074176.0, "35": 2193074176.0, "40": 2193074176.0, "45": 2193074176.0, "50": 2193074176.0, "55": 2193074176.0, "60": 2193074176.0, "65": 2193074176.0, "70": 2193074176.0, "75": 2193074176.0, "80": 2193074176.0, "85": 2193074176.0, "90": 2193074176.0, "95": 2193074176.0, "100": 2193074176.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.46967, "5": 0.1483, "10": 0.14544, "15": 0.14588, "20": 0.14639, "25": 0.14549, "30": 0.14597, "35": 0.14608, "40": 0.14578, "45": 0.14542, "50": 0.14492, "55": 0.14474, "60": 0.14635, "65": 0.14621, "70": 0.14453, "75": 0.14374, "80": 0.14465, "85": 0.14456, "90": 0.14413, "95": 0.14445, "100": 0.14399}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79205, "5": 10.84695, "10": 10.77106, "15": 10.79093, "20": 10.68042, "25": 10.50715, "30": 10.33325, "35": 10.25545, "40": 10.05544, "45": 9.80575, "50": 9.89082, "55": 9.87063, "60": 9.48478, "65": 8.94022, "70": 9.72243, "75": 9.40907, "80": 9.3976, "85": 9.60746, "90": 9.81041, "95": 9.5116, "100": 9.39722}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1580.0, "5": 1901.0, "10": 1346.0, "15": 1926.0, "20": 1643.0, "25": 1683.0, "30": 1867.0, "35": 2020.0, "40": 2252.0, "45": 2243.0, "50": 2459.0, "55": 2291.0, "60": 2404.0, "65": 2474.0, "70": 3102.0, "75": 2603.0, "80": 3420.0, "85": 3388.0, "90": 2904.0, "95": 3333.0, "100": 3347.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 714736640.0, "5": 714736640.0, "10": 714736640.0, "15": 714736640.0, "20": 714736640.0, "25": 714736640.0, "30": 714736640.0, "35": 714736640.0, "40": 714736640.0, "45": 714736640.0, "50": 714736640.0, "55": 714736640.0, "60": 714736640.0, "65": 714736640.0, "70": 714736640.0, "75": 714736640.0, "80": 714736640.0, "85": 714736640.0, "90": 714736640.0, "95": 714736640.0, "100": 714736640.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2399714304.0, "5": 2681315328.0, "10": 2681315328.0, "15": 2681315328.0, "20": 2681315328.0, "25": 2681315328.0, "30": 2681315328.0, "35": 2681315328.0, "40": 2681315328.0, "45": 2681315328.0, "50": 2681315328.0, "55": 2681315328.0, "60": 2681315328.0, "65": 2681315328.0, "70": 2681315328.0, "75": 2681315328.0, "80": 2681315328.0, "85": 2681315328.0, "90": 2681315328.0, "95": 2681315328.0, "100": 2681315328.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 11.50484, "5": 0.15616, "10": 0.15661, "15": 0.15542, "20": 0.15597, "25": 0.15623, "30": 0.15732, "35": 0.15649, "40": 0.15774, "45": 0.15673, "50": 0.15646, "55": 0.1599, "60": 0.16087, "65": 0.16049, "70": 0.15987, "75": 0.15957, "80": 0.16064, "85": 0.16045, "90": 0.15984, "95": 0.15992, "100": 0.15958}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G/model_config.yaml
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Tree
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 100
+  --timing-log-level: 2
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 50
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 1
+  --pipeline-model-parallel-size: 4
+  --disable-bias-linear: true
+  --async-save: true
+  --use-persistent-ckpt-worker: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-checkpoint-opt_param-scheduler: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --log-memory-to-tensorboard: true
+TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79196, "5": 10.84662, "10": 10.76844, "15": 10.78913, "20": 10.67859, "25": 10.50479, "30": 10.33089, "35": 10.25263, "40": 10.05242, "45": 9.80271, "50": 9.8884, "55": 9.86828, "60": 9.48223, "65": 8.93813, "70": 9.72081, "75": 9.40746, "80": 9.39636, "85": 9.60619, "90": 9.80953, "95": 9.51078, "100": 9.39612}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1613.0, "5": 1926.0, "10": 1432.0, "15": 1941.0, "20": 1592.0, "25": 1650.0, "30": 1891.0, "35": 1963.0, "40": 2255.0, "45": 2132.0, "50": 2411.0, "55": 2240.0, "60": 2443.0, "65": 2672.0, "70": 3168.0, "75": 2545.0, "80": 3353.0, "85": 3257.0, "90": 3171.0, "95": 3247.0, "100": 3375.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 717083136.0, "5": 717083136.0, "10": 717083136.0, "15": 717083136.0, "20": 717083136.0, "25": 717083136.0, "30": 717083136.0, "35": 717083136.0, "40": 717083136.0, "45": 717083136.0, "50": 717083136.0, "55": 717083136.0, "60": 717083136.0, "65": 717083136.0, "70": 717083136.0, "75": 717083136.0, "80": 717083136.0, "85": 717083136.0, "90": 717083136.0, "95": 717083136.0, "100": 717083136.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1910562816.0, "5": 2193323008.0, "10": 2193323008.0, "15": 2193323008.0, "20": 2193323008.0, "25": 2193323008.0, "30": 2193323008.0, "35": 2193323008.0, "40": 2193323008.0, "45": 2193323008.0, "50": 2193323008.0, "55": 2193323008.0, "60": 2193323008.0, "65": 2193323008.0, "70": 2193323008.0, "75": 2193323008.0, "80": 2193323008.0, "85": 2193323008.0, "90": 2193323008.0, "95": 2193323008.0, "100": 2193323008.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.72057, "5": 0.15784, "10": 0.15852, "15": 0.1575, "20": 0.15713, "25": 0.15769, "30": 0.15681, "35": 0.15447, "40": 0.15299, "45": 0.15347, "50": 0.15277, "55": 0.15216, "60": 0.15166, "65": 0.1519, "70": 0.15205, "75": 0.15222, "80": 0.15253, "85": 0.15199, "90": 0.15133, "95": 0.15154, "100": 0.15192}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.79208, "5": 10.8459, "10": 10.76945, "15": 10.78965, "20": 10.67868, "25": 10.50409, "30": 10.33064, "35": 10.25257, "40": 10.0522, "45": 9.80243, "50": 9.88792, "55": 9.86799, "60": 9.48248, "65": 8.93796, "70": 9.72094, "75": 9.40786, "80": 9.39646, "85": 9.60638, "90": 9.8096, "95": 9.51078, "100": 9.39625}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1633.0, "5": 1952.0, "10": 1432.0, "15": 1852.0, "20": 1592.0, "25": 1743.0, "30": 1953.0, "35": 1986.0, "40": 2180.0, "45": 2177.0, "50": 2468.0, "55": 2268.0, "60": 2427.0, "65": 2640.0, "70": 3158.0, "75": 2618.0, "80": 3274.0, "85": 3266.0, "90": 3078.0, "95": 3342.0, "100": 3345.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 714985472.0, "5": 714985472.0, "10": 714985472.0, "15": 714985472.0, "20": 714985472.0, "25": 714985472.0, "30": 714985472.0, "35": 714985472.0, "40": 714985472.0, "45": 714985472.0, "50": 714985472.0, "55": 714985472.0, "60": 714985472.0, "65": 714985472.0, "70": 714985472.0, "75": 714985472.0, "80": 714985472.0, "85": 714985472.0, "90": 714985472.0, "95": 714985472.0, "100": 714985472.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2399852544.0, "5": 2681564160.0, "10": 2681564160.0, "15": 2681564160.0, "20": 2681564160.0, "25": 2681564160.0, "30": 2681564160.0, "35": 2681564160.0, "40": 2681564160.0, "45": 2681564160.0, "50": 2681564160.0, "55": 2681564160.0, "60": 2681564160.0, "65": 2681564160.0, "70": 2681564160.0, "75": 2681564160.0, "80": 2681564160.0, "85": 2681564160.0, "90": 2681564160.0, "95": 2681564160.0, "100": 2681564160.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 15.13387, "5": 0.16765, "10": 0.16782, "15": 0.16572, "20": 0.16589, "25": 0.16624, "30": 0.16596, "35": 0.16694, "40": 0.16658, "45": 0.1656, "50": 0.16593, "55": 0.16847, "60": 0.16671, "65": 0.16618, "70": 0.16477, "75": 0.1663, "80": 0.16601, "85": 0.16704, "90": 0.16563, "95": 0.16515, "100": 0.16582}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.74036, "5": 10.79133, "10": 10.71217, "15": 10.75916, "20": 10.68909, "25": 10.5421, "30": 10.45456, "35": 10.38155, "40": 10.24241, "45": 9.9827, "50": 10.06896, "55": 9.98885, "60": 9.66601, "65": 9.07115, "70": 9.81824, "75": 9.55308, "80": 9.51136, "85": 9.70682, "90": 9.87981, "95": 9.60074, "100": 9.49208}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2496.0, "5": 2768.0, "10": 2420.0, "15": 2572.0, "20": 2580.0, "25": 2521.0, "30": 2632.0, "35": 2626.0, "40": 2628.0, "45": 2362.0, "50": 2543.0, "55": 2498.0, "60": 2239.0, "65": 2652.0, "70": 3100.0, "75": 2597.0, "80": 3019.0, "85": 3171.0, "90": 3464.0, "95": 3134.0, "100": 2555.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 717157888.0, "5": 717157888.0, "10": 717157888.0, "15": 717157888.0, "20": 717157888.0, "25": 717157888.0, "30": 717157888.0, "35": 717157888.0, "40": 717157888.0, "45": 717157888.0, "50": 717157888.0, "55": 717157888.0, "60": 717157888.0, "65": 717157888.0, "70": 717157888.0, "75": 717157888.0, "80": 717157888.0, "85": 717157888.0, "90": 717157888.0, "95": 717157888.0, "100": 717157888.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1910556672.0, "5": 2194053120.0, "10": 2194053120.0, "15": 2194053120.0, "20": 2194053120.0, "25": 2194053120.0, "30": 2194053120.0, "35": 2194053120.0, "40": 2194053120.0, "45": 2194053120.0, "50": 2194053120.0, "55": 2194053120.0, "60": 2194053120.0, "65": 2194053120.0, "70": 2194053120.0, "75": 2194053120.0, "80": 2194053120.0, "85": 2194053120.0, "90": 2194053120.0, "95": 2194053120.0, "100": 2194053120.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 14.70264, "5": 0.15996, "10": 0.15856, "15": 0.15761, "20": 0.15818, "25": 0.15823, "30": 0.15624, "35": 0.1572, "40": 0.15555, "45": 0.15747, "50": 0.15543, "55": 0.15768, "60": 0.15761, "65": 0.1577, "70": 0.41222, "75": 0.15706, "80": 0.15755, "85": 0.15717, "90": 0.15749, "95": 0.15708, "100": 0.15789}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.74049, "5": 10.79201, "10": 10.71088, "15": 10.76031, "20": 10.68908, "25": 10.54336, "30": 10.45425, "35": 10.38323, "40": 10.24297, "45": 9.98344, "50": 10.06864, "55": 9.9892, "60": 9.66702, "65": 9.07244, "70": 9.81879, "75": 9.55278, "80": 9.51061, "85": 9.70753, "90": 9.87996, "95": 9.60069, "100": 9.49261}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2527.0, "5": 2875.0, "10": 2475.0, "15": 2508.0, "20": 2634.0, "25": 2391.0, "30": 2505.0, "35": 2580.0, "40": 2568.0, "45": 2375.0, "50": 2618.0, "55": 2379.0, "60": 2183.0, "65": 2639.0, "70": 3090.0, "75": 2496.0, "80": 3076.0, "85": 3189.0, "90": 3454.0, "95": 3150.0, "100": 2593.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 715322368.0, "5": 715322368.0, "10": 715322368.0, "15": 715322368.0, "20": 715322368.0, "25": 715322368.0, "30": 715322368.0, "35": 715322368.0, "40": 715322368.0, "45": 715322368.0, "50": 715322368.0, "55": 715322368.0, "60": 715322368.0, "65": 715322368.0, "70": 715322368.0, "75": 715322368.0, "80": 715322368.0, "85": 715322368.0, "90": 715322368.0, "95": 715322368.0, "100": 715322368.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2402991104.0, "5": 2683341824.0, "10": 2683341824.0, "15": 2683341824.0, "20": 2683341824.0, "25": 2683341824.0, "30": 2683341824.0, "35": 2683341824.0, "40": 2683341824.0, "45": 2683341824.0, "50": 2683341824.0, "55": 2683341824.0, "60": 2683341824.0, "65": 2683341824.0, "70": 2683341824.0, "75": 2683341824.0, "80": 2683341824.0, "85": 2683341824.0, "90": 2683341824.0, "95": 2683341824.0, "100": 2683341824.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 17.64292, "5": 0.17363, "10": 0.17156, "15": 0.17206, "20": 0.1701, "25": 0.17207, "30": 0.16951, "35": 0.17005, "40": 0.17036, "45": 0.17005, "50": 0.16935, "55": 0.16909, "60": 0.16956, "65": 0.16911, "70": 0.16772, "75": 0.16805, "80": 0.16819, "85": 0.16813, "90": 0.30023, "95": 0.16879, "100": 0.16784}}}
\ No newline at end of file