更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8401, 10.87262, 10.85023, 10.79645, 10.68149, 10.60617, 10.1277, 10.22183, 10.13794, 9.8231]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1670.0, 1901.0, 1923.0, 1922.0, 2020.0, 1815.0, 1713.0, 1963.0, 2266.0, 2324.0]}, "iteration_timing_avg": 0.09164500000000002}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8401, "5": 10.84032, "10": 10.81341, "15": 10.80278, "20": 10.70496, "25": 10.53846, "30": 10.35517, "35": 10.27147, "40": 10.08045, "45": 9.82292, "50": 9.90114}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1670.0, "5": 1970.0, "10": 1436.0, "15": 1918.0, "20": 1786.0, "25": 1610.0, "30": 2039.0, "35": 2001.0, "40": 2321.0, "45": 2205.0, "50": 2365.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1230390272.0, "5": 1230390272.0, "10": 1230390272.0, "15": 1230390272.0, "20": 1230390272.0, "25": 1230390272.0, "30": 1230390272.0, "35": 1230390272.0, "40": 1230390272.0, "45": 1230390272.0, "50": 1230390272.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1984492032.0, "5": 2531972608.0, "10": 2531972608.0, "15": 2531972608.0, "20": 2531972608.0, "25": 2531972608.0, "30": 2531972608.0, "35": 2531972608.0, "40": 2531972608.0, "45": 2531972608.0, "50": 2531972608.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 8.0418, "5": 0.12998, "10": 0.12656, "15": 0.12621, "20": 0.13103, "25": 0.12628, "30": 0.12409, "35": 0.12632, "40": 0.13313, "45": 0.12545, "50": 0.12421}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,4 +50,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.82974, 10.85934, 10.88536, 10.78981, 10.64534, 10.56415, 9.99534, 10.13972, 10.06259, 9.71481]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [261.0, 256.0, 258.0, 250.0, 243.0, 265.0, 254.0, 299.0, 299.0, 294.0]}, "iteration_timing_avg": 0.3993126470588235}
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82974, "5": 10.84387, "10": 10.79336, "15": 10.77992, "20": 10.67707, "25": 10.48581, "30": 10.28464, "35": 10.18863, "40": 9.99275, "45": 9.72154, "50": 9.82122}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 214.0, "5": 270.0, "10": 224.0, "15": 235.0, "20": 242.0, "25": 260.0, "30": 280.0, "35": 300.0, "40": 334.0, "45": 324.0, "50": 298.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 829378048.0, "5": 829378048.0, "10": 829378048.0, "15": 829378048.0, "20": 829378048.0, "25": 829378048.0, "30": 829378048.0, "35": 829378048.0, "40": 829378048.0, "45": 829378048.0, "50": 829378048.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 891564544.0, "5": 1248933376.0, "10": 1250505728.0, "15": 1250505728.0, "20": 1250505728.0, "25": 1250505728.0, "30": 1250505728.0, "35": 1250505728.0, "40": 1250505728.0, "45": 1250505728.0, "50": 1250505728.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 20.58657, "5": 0.44565, "10": 0.45716, "15": 0.50953, "20": 0.44872, "25": 0.44791, "30": 0.44871, "35": 0.44188, "40": 0.44233, "45": 0.44161, "50": 0.44069}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85803, 10.88122, 10.85832, 10.80987, 10.66115, 10.55375, 10.01843, 10.14234, 10.05958, 9.71149]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [244.0, 231.0, 243.0, 257.0, 247.0, 267.0, 256.0, 299.0, 318.0, 325.0]}, "iteration_timing_avg": 0.3993126470588235}
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.82975, "5": 10.8439, "10": 10.79337, "15": 10.77994, "20": 10.67712, "25": 10.48584, "30": 10.28468, "35": 10.18859, "40": 9.99279, "45": 9.72153, "50": 9.82127}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 226.0, "5": 275.0, "10": 181.0, "15": 253.0, "20": 248.0, "25": 207.0, "30": 265.0, "35": 281.0, "40": 315.0, "45": 282.0, "50": 336.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 831212544.0, "5": 831212544.0, "10": 831212544.0, "15": 831212544.0, "20": 831212544.0, "25": 831212544.0, "30": 831212544.0, "35": 831212544.0, "40": 831212544.0, "45": 831212544.0, "50": 831212544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 891582464.0, "5": 1250786304.0, "10": 1250786304.0, "15": 1250786304.0, "20": 1250786304.0, "25": 1251832320.0, "30": 1251832320.0, "35": 1251832320.0, "40": 1251832320.0, "45": 1251832320.0, "50": 1251832320.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 13.63617, "5": 0.42436, "10": 0.41552, "15": 0.4158, "20": 0.41223, "25": 0.40643, "30": 0.40417, "35": 0.40442, "40": 0.40546, "45": 0.40627, "50": 0.40596}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
@@ -17,8 +17,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,5 +50,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: flash
-  
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82974, "5": 10.84387, "10": 10.79336, "15": 10.77992, "20": 10.67707, "25": 10.48581, "30": 10.28464, "35": 10.18863, "40": 9.99275, "45": 9.72154, "50": 9.82122, "55": 9.79605, "60": 9.41615, "65": 8.85917, "70": 9.67001, "75": 9.3564, "80": 9.34748, "85": 9.55946, "90": 9.77362, "95": 9.47863, "100": 9.35146}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 214.0, "5": 270.0, "10": 224.0, "15": 235.0, "20": 242.0, "25": 260.0, "30": 280.0, "35": 300.0, "40": 334.0, "45": 324.0, "50": 298.0, "55": 390.0, "60": 342.0, "65": 394.0, "70": 411.0, "75": 319.0, "80": 414.0, "85": 441.0, "90": 381.0, "95": 398.0, "100": 431.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 829378048.0, "5": 829378048.0, "10": 829378048.0, "15": 829378048.0, "20": 829378048.0, "25": 829378048.0, "30": 829378048.0, "35": 829378048.0, "40": 829378048.0, "45": 829378048.0, "50": 829378048.0, "55": 829378048.0, "60": 829378048.0, "65": 829378048.0, "70": 829378048.0, "75": 829378048.0, "80": 829378048.0, "85": 829378048.0, "90": 829378048.0, "95": 829378048.0, "100": 829378048.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 892610560.0, "5": 1248933376.0, "10": 1248933376.0, "15": 1248933376.0, "20": 1248933376.0, "25": 1248933376.0, "30": 1248933376.0, "35": 1249456128.0, "40": 1249456128.0, "45": 1249456128.0, "50": 1249980928.0, "55": 1249980928.0, "60": 1249980928.0, "65": 1249980928.0, "70": 1249980928.0, "75": 1250504192.0, "80": 1250504192.0, "85": 1250504192.0, "90": 1250505728.0, "95": 1250505728.0, "100": 1250505728.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 24.66296, "5": 0.45069, "10": 0.44192, "15": 0.44436, "20": 0.442, "25": 0.44288, "30": 0.44618, "35": 0.44139, "40": 0.44072, "45": 0.44429, "50": 0.43893, "55": 0.43569, "60": 0.43551, "65": 0.43912, "70": 0.44568, "75": 0.44023, "80": 0.43745, "85": 0.43617, "90": 0.43925, "95": 0.43653, "100": 0.43561}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82975, "5": 10.8439, "10": 10.79337, "15": 10.77994, "20": 10.67712, "25": 10.48584, "30": 10.28468, "35": 10.18859, "40": 9.99279, "45": 9.72153, "50": 9.82127, "55": 9.79611, "60": 9.41616, "65": 8.85917, "70": 9.67001, "75": 9.35641, "80": 9.34751, "85": 9.55947, "90": 9.77366, "95": 9.47865, "100": 9.35145}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 226.0, "5": 275.0, "10": 181.0, "15": 253.0, "20": 248.0, "25": 207.0, "30": 265.0, "35": 281.0, "40": 315.0, "45": 282.0, "50": 336.0, "55": 373.0, "60": 343.0, "65": 389.0, "70": 436.0, "75": 337.0, "80": 395.0, "85": 419.0, "90": 412.0, "95": 405.0, "100": 394.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 831212544.0, "5": 831212544.0, "10": 831212544.0, "15": 831212544.0, "20": 831212544.0, "25": 831212544.0, "30": 831212544.0, "35": 831212544.0, "40": 831212544.0, "45": 831212544.0, "50": 831212544.0, "55": 831212544.0, "60": 831212544.0, "65": 831212544.0, "70": 831212544.0, "75": 831212544.0, "80": 831212544.0, "85": 831212544.0, "90": 831212544.0, "95": 831212544.0, "100": 831212544.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 891582464.0, "5": 1250786304.0, "10": 1250786304.0, "15": 1250786304.0, "20": 1250786304.0, "25": 1250786304.0, "30": 1250786304.0, "35": 1250786304.0, "40": 1251834880.0, "45": 1251834880.0, "50": 1251834880.0, "55": 1251834880.0, "60": 1251834880.0, "65": 1251834880.0, "70": 1251834880.0, "75": 1251834880.0, "80": 1251834880.0, "85": 1251834880.0, "90": 1251834880.0, "95": 1251834880.0, "100": 1251834880.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 14.7102, "5": 0.46307, "10": 0.41777, "15": 0.41661, "20": 0.41769, "25": 0.42698, "30": 0.41765, "35": 0.42804, "40": 0.42081, "45": 0.42234, "50": 0.41276, "55": 0.43287, "60": 0.43055, "65": 0.43352, "70": 0.42189, "75": 0.42153, "80": 0.41723, "85": 0.40522, "90": 0.40231, "95": 0.4016, "100": 0.40172}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G/model_config.yaml
@@ -17,8 +17,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -51,4 +51,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: flash
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82949, "5": 10.84768, "10": 10.79952, "15": 10.83278, "20": 10.75815, "25": 10.59944, "30": 10.44255, "35": 10.35518, "40": 10.17871, "45": 9.93731, "50": 9.99597, "55": 9.96506, "60": 9.59206, "65": 9.01654, "70": 9.78255, "75": 9.48023, "80": 9.4506, "85": 9.65781, "90": 9.84565, "95": 9.54832, "100": 9.43863}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 30437.0, "5": 35925.0, "10": 29186.0, "15": 34264.0, "20": 32053.0, "25": 30879.0, "30": 33163.0, "35": 34561.0, "40": 35765.0, "45": 35584.0, "50": 39786.0, "55": 37204.0, "60": 40266.0, "65": 41421.0, "70": 45637.0, "75": 40348.0, "80": 46876.0, "85": 49638.0, "90": 49468.0, "95": 47017.0, "100": 45528.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 936543232.0, "5": 936543744.0, "10": 936542720.0, "15": 936543232.0, "20": 936544768.0, "25": 936543232.0, "30": 936543232.0, "35": 936541184.0, "40": 936542720.0, "45": 936543232.0, "50": 936544256.0, "55": 936546816.0, "60": 936547328.0, "65": 936556032.0, "70": 936546816.0, "75": 936544256.0, "80": 936556544.0, "85": 936553984.0, "90": 936546304.0, "95": 936548352.0, "100": 936551936.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2451792384.0, "5": 2720901120.0, "10": 2720901120.0, "15": 2720901120.0, "20": 2720901120.0, "25": 2720901120.0, "30": 2720901120.0, "35": 2721362432.0, "40": 2721362432.0, "45": 2721362432.0, "50": 2724393984.0, "55": 2724393984.0, "60": 2728018432.0, "65": 2738826240.0, "70": 2738826240.0, "75": 2740684288.0, "80": 2740684288.0, "85": 2740684288.0, "90": 2741338624.0, "95": 2741338624.0, "100": 2741338624.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 17.42632, "5": 0.24404, "10": 0.242, "15": 0.23944, "20": 0.23931, "25": 0.23806, "30": 0.23357, "35": 0.23421, "40": 0.23628, "45": 0.23522, "50": 0.23575, "55": 0.24699, "60": 0.24808, "65": 0.25066, "70": 0.23754, "75": 0.23814, "80": 0.23925, "85": 0.23699, "90": 0.23541, "95": 0.23763, "100": 0.23866}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.82949, "5": 10.84751, "10": 10.79994, "15": 10.83348, "20": 10.75739, "25": 10.59863, "30": 10.44207, "35": 10.35534, "40": 10.17846, "45": 9.93775, "50": 9.99583, "55": 9.96526, "60": 9.59209, "65": 9.01675, "70": 9.78268, "75": 9.4802, "80": 9.45051, "85": 9.65787, "90": 9.84587, "95": 9.54779, "100": 9.43905}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 30304.0, "5": 35542.0, "10": 29062.0, "15": 34559.0, "20": 31981.0, "25": 30845.0, "30": 32894.0, "35": 34952.0, "40": 36358.0, "45": 35638.0, "50": 40119.0, "55": 36895.0, "60": 39710.0, "65": 41463.0, "70": 45566.0, "75": 40307.0, "80": 46882.0, "85": 50049.0, "90": 49238.0, "95": 47300.0, "100": 45898.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 936567296.0, "5": 936566784.0, "10": 936566784.0, "15": 936567808.0, "20": 936568832.0, "25": 936565760.0, "30": 936568320.0, "35": 936564736.0, "40": 936566784.0, "45": 936566784.0, "50": 936567808.0, "55": 936570880.0, "60": 936570880.0, "65": 936580608.0, "70": 936571392.0, "75": 936568320.0, "80": 936580608.0, "85": 936578560.0, "90": 936569856.0, "95": 936572416.0, "100": 936576512.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 24.80877, "5": 0.2156, "10": 0.23039, "15": 0.21152, "20": 0.21327, "25": 0.2116, "30": 0.20846, "35": 0.2099, "40": 0.20891, "45": 0.20828, "50": 0.20799, "55": 0.20851, "60": 0.20961, "65": 0.21172, "70": 0.20966, "75": 0.20994, "80": 0.21009, "85": 0.20683, "90": 0.20599, "95": 0.20814, "100": 0.20924}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G/model_config.yaml
@@ -17,8 +17,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -52,4 +52,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84764, "5": 10.86567, "10": 10.82469, "15": 10.81348, "20": 10.72058, "25": 10.53162, "30": 10.33683, "35": 10.24089, "40": 10.05113, "45": 9.76815, "50": 9.85503, "55": 9.82458, "60": 9.44286, "65": 8.89124, "70": 9.67905, "75": 9.36822, "80": 9.35789, "85": 9.56054, "90": 9.77055, "95": 9.48111, "100": 9.34966}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1736.0, "5": 1989.0, "10": 1643.0, "15": 1984.0, "20": 1713.0, "25": 1775.0, "30": 2005.0, "35": 2093.0, "40": 2238.0, "45": 2229.0, "50": 2348.0, "55": 2407.0, "60": 2545.0, "65": 2732.0, "70": 3041.0, "75": 2930.0, "80": 3261.0, "85": 3370.0, "90": 3188.0, "95": 3193.0, "100": 3397.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 894390272.0, "5": 894390272.0, "10": 894390272.0, "15": 894390272.0, "20": 894390272.0, "25": 894390272.0, "30": 894390272.0, "35": 894390272.0, "40": 894390272.0, "45": 894390272.0, "50": 894390272.0, "55": 894390272.0, "60": 894390272.0, "65": 894390272.0, "70": 894390272.0, "75": 894390272.0, "80": 894390272.0, "85": 894390272.0, "90": 894390272.0, "95": 894390272.0, "100": 894390272.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2233004032.0, "5": 2597712896.0, "10": 2597712896.0, "15": 2597712896.0, "20": 2597712896.0, "25": 2597712896.0, "30": 2597712896.0, "35": 2597712896.0, "40": 2597712896.0, "45": 2597712896.0, "50": 2597712896.0, "55": 2597712896.0, "60": 2597712896.0, "65": 2597712896.0, "70": 2597712896.0, "75": 2597712896.0, "80": 2597712896.0, "85": 2597712896.0, "90": 2597712896.0, "95": 2597712896.0, "100": 2597712896.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.04286, "5": 0.12525, "10": 0.12905, "15": 0.12687, "20": 0.12848, "25": 0.12854, "30": 0.12621, "35": 0.1283, "40": 0.12782, "45": 0.12535, "50": 0.12584, "55": 0.12504, "60": 0.1249, "65": 0.36941, "70": 0.12553, "75": 0.12455, "80": 0.12658, "85": 0.12479, "90": 0.12521, "95": 0.12546, "100": 0.1255}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.8468, "5": 10.8657, "10": 10.82411, "15": 10.8128, "20": 10.72008, "25": 10.53151, "30": 10.33655, "35": 10.24133, "40": 10.05096, "45": 9.76804, "50": 9.85531, "55": 9.82458, "60": 9.4433, "65": 8.89103, "70": 9.67922, "75": 9.36864, "80": 9.35829, "85": 9.56053, "90": 9.77063, "95": 9.48104, "100": 9.34984}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1707.0, "5": 2121.0, "10": 1606.0, "15": 1959.0, "20": 1756.0, "25": 1848.0, "30": 2091.0, "35": 2089.0, "40": 2156.0, "45": 2137.0, "50": 2317.0, "55": 2485.0, "60": 2487.0, "65": 2748.0, "70": 3067.0, "75": 2801.0, "80": 3131.0, "85": 3343.0, "90": 3084.0, "95": 3062.0, "100": 3270.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0, "55": 888098304.0, "60": 888098304.0, "65": 888098304.0, "70": 888098304.0, "75": 888098304.0, "80": 888098304.0, "85": 888098304.0, "90": 888098304.0, "95": 888098304.0, "100": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3212632576.0, "5": 3572098560.0, "10": 3572098560.0, "15": 3572098560.0, "20": 3572098560.0, "25": 3572098560.0, "30": 3572098560.0, "35": 3572098560.0, "40": 3572098560.0, "45": 3572098560.0, "50": 3572098560.0, "55": 3572098560.0, "60": 3572098560.0, "65": 3572098560.0, "70": 3572098560.0, "75": 3572098560.0, "80": 3572098560.0, "85": 3572098560.0, "90": 3572098560.0, "95": 3572098560.0, "100": 3572098560.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.16354, "5": 0.14461, "10": 0.14503, "15": 0.14287, "20": 0.14648, "25": 0.14267, "30": 0.14304, "35": 0.14471, "40": 0.14334, "45": 0.14299, "50": 0.14181, "55": 0.14263, "60": 0.14235, "65": 0.14203, "70": 0.14227, "75": 0.14188, "80": 0.14258, "85": 0.14302, "90": 0.14176, "95": 0.14354, "100": 0.14267}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_dev.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84554, "5": 10.86415, "10": 10.82215, "15": 10.81274, "20": 10.71915, "25": 10.53056, "30": 10.33604, "35": 10.24047, "40": 10.05025, "45": 9.76775, "50": 9.85479, "55": 9.82458, "60": 9.44264, "65": 8.89112, "70": 9.6789, "75": 9.36801, "80": 9.3576, "85": 9.56029, "90": 9.77049, "95": 9.48101, "100": 9.34984}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1700.0, "5": 2064.0, "10": 1561.0, "15": 1975.0, "20": 1696.0, "25": 1796.0, "30": 2014.0, "35": 2041.0, "40": 2189.0, "45": 2150.0, "50": 2403.0, "55": 2453.0, "60": 2540.0, "65": 2707.0, "70": 3080.0, "75": 2725.0, "80": 3156.0, "85": 3362.0, "90": 3032.0, "95": 3108.0, "100": 3352.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 890195968.0, "5": 890195968.0, "10": 890195968.0, "15": 890195968.0, "20": 890195968.0, "25": 890195968.0, "30": 890195968.0, "35": 890195968.0, "40": 890195968.0, "45": 890195968.0, "50": 890195968.0, "55": 890195968.0, "60": 890195968.0, "65": 890195968.0, "70": 890195968.0, "75": 890195968.0, "80": 890195968.0, "85": 890195968.0, "90": 890195968.0, "95": 890195968.0, "100": 890195968.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2236149760.0, "5": 2596664320.0, "10": 2596664320.0, "15": 2596664320.0, "20": 2596664320.0, "25": 2596664320.0, "30": 2596664320.0, "35": 2596664320.0, "40": 2596664320.0, "45": 2596664320.0, "50": 2596664320.0, "55": 2596664320.0, "60": 2596664320.0, "65": 2596664320.0, "70": 2596664320.0, "75": 2596664320.0, "80": 2596664320.0, "85": 2596664320.0, "90": 2596664320.0, "95": 2596664320.0, "100": 2596664320.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 8.95666, "5": 0.15388, "10": 0.15258, "15": 0.15019, "20": 0.14968, "25": 0.14923, "30": 0.14924, "35": 0.14855, "40": 0.14992, "45": 0.14894, "50": 0.14897, "55": 0.15057, "60": 0.14854, "65": 0.14894, "70": 0.15078, "75": 0.14842, "80": 0.1482, "85": 0.14764, "90": 0.14679, "95": 0.14761, "100": 0.1488}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/golden_values_lts.json
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.84474, "5": 10.86418, "10": 10.82155, "15": 10.81195, "20": 10.71872, "25": 10.53036, "30": 10.3358, "35": 10.24082, "40": 10.05008, "45": 9.76762, "50": 9.85505, "55": 9.82465, "60": 9.44305, "65": 8.89104, "70": 9.67902, "75": 9.36836, "80": 9.35799, "85": 9.56032, "90": 9.77055, "95": 9.48101, "100": 9.34997}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1776.0, "5": 2128.0, "10": 1615.0, "15": 2021.0, "20": 1775.0, "25": 1916.0, "30": 2029.0, "35": 2107.0, "40": 2174.0, "45": 2110.0, "50": 2363.0, "55": 2460.0, "60": 2462.0, "65": 2724.0, "70": 2952.0, "75": 2823.0, "80": 3222.0, "85": 3314.0, "90": 3087.0, "95": 3146.0, "100": 3331.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0, "55": 888098304.0, "60": 888098304.0, "65": 888098304.0, "70": 888098304.0, "75": 888098304.0, "80": 888098304.0, "85": 888098304.0, "90": 888098304.0, "95": 888098304.0, "100": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3215778304.0, "5": 3575244288.0, "10": 3575244288.0, "15": 3575244288.0, "20": 3575244288.0, "25": 3575244288.0, "30": 3575244288.0, "35": 3575244288.0, "40": 3575244288.0, "45": 3575244288.0, "50": 3575244288.0, "55": 3575244288.0, "60": 3575244288.0, "65": 3575244288.0, "70": 3575244288.0, "75": 3575244288.0, "80": 3575244288.0, "85": 3575244288.0, "90": 3575244288.0, "95": 3575244288.0, "100": 3575244288.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.33569, "5": 0.16136, "10": 0.15782, "15": 0.15802, "20": 0.15824, "25": 0.16808, "30": 0.16851, "35": 0.1675, "40": 0.16865, "45": 0.16815, "50": 0.16766, "55": 0.1655, "60": 0.16617, "65": 0.16519, "70": 0.16575, "75": 0.16497, "80": 0.16524, "85": 0.16595, "90": 0.16421, "95": 0.16539, "100": 0.16546}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,4 +50,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_dev.json
-{
-    "lm loss": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.8468,
-            10.87769,
-            10.90302,
-            10.82026,
-            10.67979,
-            10.60157,
-            10.06449,
-            10.19316,
-            10.11411,
-            9.76007
-        ]
-    },
-    "num-zeros": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            1692.0,
-            2044.0,
-            2005.0,
-            2007.0,
-            1945.0,
-            1868.0,
-            1701.0,
-            2085.0,
-            2389.0,
-            2377.0
-        ]
-    },
-    "iteration-time": {
-        "start_step": 0,
-        "end_step": 50,
-        "step_interval": 5,
-        "values": [
-            10.20538,
-            0.14353,
-            0.14213,
-            0.14213,
-            0.14068,
-            0.14104,
-            0.14078,
-            0.14149,
-            0.14065,
-            0.14118
-        ]
-    }
-}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8468, "5": 10.86571, "10": 10.82412, "15": 10.8128, "20": 10.7201, "25": 10.53149, "30": 10.33653, "35": 10.24134, "40": 10.05092, "45": 9.76805, "50": 9.85531}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1692.0, "5": 2135.0, "10": 1681.0, "15": 2053.0, "20": 1708.0, "25": 1835.0, "30": 2038.0, "35": 2087.0, "40": 2276.0, "45": 2125.0, "50": 2363.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3212632576.0, "5": 3572098560.0, "10": 3572098560.0, "15": 3572098560.0, "20": 3572098560.0, "25": 3572098560.0, "30": 3572098560.0, "35": 3572098560.0, "40": 3572098560.0, "45": 3572098560.0, "50": 3572098560.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 9.88958, "5": 0.14651, "10": 0.14518, "15": 0.14433, "20": 0.14484, "25": 0.14428, "30": 0.14459, "35": 0.1448, "40": 0.14541, "45": 0.14409, "50": 0.14459}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8468, 10.87772, 10.90302, 10.82024, 10.67979, 10.60157, 10.06448, 10.19311, 10.1141, 9.76008]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1707.0, 2086.0, 2030.0, 2000.0, 1910.0, 1894.0, 1744.0, 2071.0, 2344.0, 2377.0]}, "iteration_timing_avg": 0.11051617647058823}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.8468, "5": 10.8657, "10": 10.82411, "15": 10.8128, "20": 10.72008, "25": 10.53151, "30": 10.33655, "35": 10.24133, "40": 10.05096, "45": 9.76804, "50": 9.85531}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1707.0, "5": 2121.0, "10": 1606.0, "15": 1959.0, "20": 1756.0, "25": 1848.0, "30": 2091.0, "35": 2089.0, "40": 2156.0, "45": 2137.0, "50": 2317.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 888098304.0, "5": 888098304.0, "10": 888098304.0, "15": 888098304.0, "20": 888098304.0, "25": 888098304.0, "30": 888098304.0, "35": 888098304.0, "40": 888098304.0, "45": 888098304.0, "50": 888098304.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 3212632576.0, "5": 3572098560.0, "10": 3572098560.0, "15": 3572098560.0, "20": 3572098560.0, "25": 3572098560.0, "30": 3572098560.0, "35": 3572098560.0, "40": 3572098560.0, "45": 3572098560.0, "50": 3572098560.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.69368, "5": 0.1479, "10": 0.14574, "15": 0.14499, "20": 0.14659, "25": 0.14524, "30": 0.14507, "35": 0.14609, "40": 0.1467, "45": 0.14341, "50": 0.14274}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular