更新代码

688448db · silencealiang · a02a5490 · 688448db · 688448db · 688448db
Commit 688448db authored Mar 14, 2025 by silencealiang
20 changed files
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.80264, 10.85778, 10.86259, 10.83903, 10.82934, 10.81016, 10.60251, 10.61471, 10.54092, 10.27186, 10.24338, 10.02058, 10.03017, 9.99471, 9.84885, 9.34867, 9.67263, 9.2457, 9.53365, 9.67548]}, "num-zeros": {"start_step": 0, "end_step": 84, "step_interval": 5, "values": [8571.0, 7897.0, 7748.0, 9008.0, 9165.0, 8986.0, 9155.0, 7960.0, 7684.0, 9743.0, 8727.0, 9382.0, 10992.0, 11177.0, 11270.0, 13404.0, 11533.0]}, "iteration_timing_avg": 0.3735462686567164}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.76813, "5": 10.82426, "10": 10.7488, "15": 10.82093, "20": 10.79407, "25": 10.74528, "30": 10.68463, "35": 10.62109, "40": 10.47053, "45": 10.24915, "50": 10.27379, "55": 10.20448, "60": 9.84999, "65": 9.28499, "70": 9.94476, "75": 9.62753, "80": 9.57725, "85": 9.76823, "90": 9.93273, "95": 9.64547, "100": 9.53769}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 607448576.0, "5": 607448576.0, "10": 607448576.0, "15": 607448576.0, "20": 944340992.0, "25": 944037888.0, "30": 944954368.0, "35": 944078848.0, "40": 944078848.0, "45": 944078848.0, "50": 944992256.0, "55": 944078848.0, "60": 944078848.0, "65": 943674368.0, "70": 945127424.0, "75": 944078848.0, "80": 944322560.0, "85": 944078848.0, "90": 944078848.0, "95": 944078848.0, "100": 944993280.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1843249152.0, "5": 1843369472.0, "10": 1844654592.0, "15": 1844654592.0, "20": 2181567488.0, "25": 2181567488.0, "30": 2181567488.0, "35": 2181567488.0, "40": 2181567488.0, "45": 2181567488.0, "50": 2181567488.0, "55": 2181567488.0, "60": 2181567488.0, "65": 2181567488.0, "70": 2181567488.0, "75": 2181567488.0, "80": 2181567488.0, "85": 2181567488.0, "90": 2181567488.0, "95": 2181635584.0, "100": 2181635584.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 18.6534, "5": 0.52336, "10": 0.51659, "15": 0.52097, "20": 0.5413, "25": 0.56055, "30": 0.53271, "35": 0.54237, "40": 0.5352, "45": 0.53408, "50": 0.53304, "55": 0.53075, "60": 0.53399, "65": 0.53294, "70": 0.53179, "75": 0.69389, "80": 0.531, "85": 0.52842, "90": 0.53117, "95": 0.53133, "100": 0.72087}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 7024.0, "25": 7902.0, "30": 8336.0, "35": 7346.0, "40": 7522.0, "45": 8100.0, "50": 8998.0, "55": 8207.0, "60": 9031.0, "65": 7785.0, "70": 10580.0, "75": 9533.0, "80": 11195.0, "85": 11864.0, "90": 12414.0, "95": 13058.0, "100": 10097.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -51,4 +51,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.85929, "5": 10.87929, "10": 10.84772, "15": 10.86867, "20": 10.87317, "25": 10.83338, "30": 10.75624, "35": 10.66844, "40": 10.50171, "45": 10.28002, "50": 10.25621, "55": 10.18314, "60": 9.79897, "65": 9.24752, "70": 9.91362, "75": 9.58564, "80": 9.54312, "85": 9.72736, "90": 9.90472, "95": 9.6077, "100": 9.49935}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 380476416.0, "5": 380476416.0, "10": 380476416.0, "15": 380476416.0, "20": 560287232.0, "25": 560287232.0, "30": 560287232.0, "35": 561073664.0, "40": 560287232.0, "45": 561597952.0, "50": 561597952.0, "55": 561073664.0, "60": 561073664.0, "65": 561597952.0, "70": 560287232.0, "75": 560287232.0, "80": 560287232.0, "85": 560287232.0, "90": 560287232.0, "95": 561597952.0, "100": 560287232.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1702977024.0, "5": 1702977536.0, "10": 1702977536.0, "15": 1702977536.0, "20": 1884361216.0, "25": 1884361216.0, "30": 1884361216.0, "35": 1884361216.0, "40": 1884361216.0, "45": 1884361216.0, "50": 1884361216.0, "55": 1884361216.0, "60": 1884361216.0, "65": 1884361216.0, "70": 1884361216.0, "75": 1884361216.0, "80": 1884361216.0, "85": 1884361216.0, "90": 1884361216.0, "95": 1884361216.0, "100": 1884361216.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 4.70051, "5": 0.18489, "10": 0.1885, "15": 0.18516, "20": 0.19623, "25": 0.19562, "30": 0.19558, "35": 0.19543, "40": 0.19414, "45": 0.19546, "50": 0.1943, "55": 0.19481, "60": 0.19412, "65": 0.19731, "70": 0.19502, "75": 0.1953, "80": 0.19592, "85": 0.19662, "90": 0.19524, "95": 0.19564, "100": 0.19497}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1808.0, "25": 2385.0, "30": 2591.0, "35": 1997.0, "40": 1959.0, "45": 2368.0, "50": 3073.0, "55": 2580.0, "60": 2853.0, "65": 2346.0, "70": 3572.0, "75": 2886.0, "80": 3459.0, "85": 4068.0, "90": 3747.0, "95": 4088.0, "100": 3436.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.85929, 10.89211, 10.87639, 10.86988, 10.88179, 10.83898, 10.66589, 10.62691, 10.52461, 10.25708, 10.19741, 9.9562, 9.96369, 9.91398, 9.79604, 9.2686, 9.61975, 9.19501, 9.47332, 9.62216]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2458.0, 2527.0, 2467.0, 2148.0, 2250.0, 2467.0, 2528.0, 3656.0, 3275.0, 3203.0, 3297.0, 3364.0, 3789.0, 3277.0, 3660.0, 3733.0, 4815.0]}, "iteration_timing_avg": 0.1628459701492537}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.85929, "5": 10.87929, "10": 10.84772, "15": 10.86867, "20": 10.87317, "25": 10.83338, "30": 10.75624, "35": 10.66844, "40": 10.50171, "45": 10.28002, "50": 10.25621, "55": 10.18314, "60": 9.79897, "65": 9.24752, "70": 9.91362, "75": 9.58564, "80": 9.54312, "85": 9.72736, "90": 9.90472, "95": 9.6077, "100": 9.49935}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 378903552.0, "5": 379952128.0, "10": 379952128.0, "15": 379952128.0, "20": 560549376.0, "25": 560549376.0, "30": 560549376.0, "35": 560549376.0, "40": 560549376.0, "45": 560549376.0, "50": 560549376.0, "55": 561073664.0, "60": 561073664.0, "65": 560549376.0, "70": 560549376.0, "75": 560549376.0, "80": 560549376.0, "85": 560549376.0, "90": 560549376.0, "95": 560549376.0, "100": 560549376.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1704025600.0, "5": 1704026112.0, "10": 1704026112.0, "15": 1704026112.0, "20": 1886196224.0, "25": 1886196224.0, "30": 1886196224.0, "35": 1886196224.0, "40": 1886196224.0, "45": 1886196224.0, "50": 1886196224.0, "55": 1886196224.0, "60": 1886196224.0, "65": 1886196224.0, "70": 1886196224.0, "75": 1886196224.0, "80": 1886196224.0, "85": 1886196224.0, "90": 1886196224.0, "95": 1886196224.0, "100": 1886196224.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 13.26991, "5": 0.1958, "10": 0.19444, "15": 0.194, "20": 0.20361, "25": 0.20332, "30": 0.20368, "35": 0.20417, "40": 0.20368, "45": 0.20398, "50": 0.2037, "55": 0.20453, "60": 0.20433, "65": 0.20387, "70": 0.20373, "75": 0.20399, "80": 0.20347, "85": 0.20432, "90": 0.2036, "95": 0.20374, "100": 0.20437}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1808.0, "25": 2385.0, "30": 2591.0, "35": 1997.0, "40": 1959.0, "45": 2368.0, "50": 3073.0, "55": 2580.0, "60": 2853.0, "65": 2346.0, "70": 3572.0, "75": 2886.0, "80": 3459.0, "85": 4068.0, "90": 3747.0, "95": 4088.0, "100": 3436.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -48,4 +48,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86312, "5": 10.86984, "10": 10.84273, "15": 10.88712, "20": 10.87623, "25": 10.83465, "30": 10.75356, "35": 10.67297, "40": 10.50224, "45": 10.28079, "50": 10.27239}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 284527616.0, "5": 284527616.0, "10": 284527616.0, "15": 284527616.0, "20": 416513536.0, "25": 416513536.0, "30": 416513536.0, "35": 416513536.0, "40": 416513536.0, "45": 416513536.0, "50": 416513536.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1464318464.0, "5": 1464319488.0, "10": 1464320000.0, "15": 1464320000.0, "20": 1597091840.0, "25": 1597091840.0, "30": 1597091840.0, "35": 1597091840.0, "40": 1597092352.0, "45": 1597092352.0, "50": 1597092352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 5.08228, "5": 0.27074, "10": 0.26257, "15": 0.26176, "20": 0.27712, "25": 0.27706, "30": 0.27709, "35": 0.28021, "40": 0.28046, "45": 0.27903, "50": 0.27978}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1645.0, "25": 2124.0, "30": 2345.0, "35": 1780.0, "40": 1936.0, "45": 2289.0, "50": 2738.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23144205882352942}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86312, "5": 10.86984, "10": 10.84273, "15": 10.88712, "20": 10.87623, "25": 10.83465, "30": 10.75356, "35": 10.67297, "40": 10.50224, "45": 10.28079, "50": 10.27239}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 284527616.0, "5": 284527616.0, "10": 284527616.0, "15": 284527616.0, "20": 416513536.0, "25": 416513536.0, "30": 416513536.0, "35": 416513536.0, "40": 416513536.0, "45": 416513536.0, "50": 416513536.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1465368064.0, "5": 1465368064.0, "10": 1465368576.0, "15": 1465368576.0, "20": 1596304896.0, "25": 1596304896.0, "30": 1596304896.0, "35": 1596304896.0, "40": 1596304896.0, "45": 1596305408.0, "50": 1596305408.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 6.797, "5": 0.28708, "10": 0.286, "15": 0.28021, "20": 0.30007, "25": 0.29697, "30": 0.29501, "35": 0.29587, "40": 0.29259, "45": 0.2983, "50": 0.29365}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1645.0, "25": 2124.0, "30": 2345.0, "35": 1780.0, "40": 1936.0, "45": 2289.0, "50": 2738.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -46,4 +46,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86312, "5": 10.86984, "10": 10.84273, "15": 10.88712, "20": 10.87623, "25": 10.83465, "30": 10.75356, "35": 10.67297, "40": 10.50224, "45": 10.28079, "50": 10.27239}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 284527616.0, "5": 284527616.0, "10": 284527616.0, "15": 284527616.0, "20": 416513536.0, "25": 416513536.0, "30": 416513536.0, "35": 416513536.0, "40": 416513536.0, "45": 416513536.0, "50": 416513536.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1464318464.0, "5": 1464320000.0, "10": 1464320000.0, "15": 1464320000.0, "20": 1597089792.0, "25": 1597091328.0, "30": 1597092352.0, "35": 1597092352.0, "40": 1597092352.0, "45": 1597092352.0, "50": 1597092352.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 4.11891, "5": 0.27161, "10": 0.26629, "15": 0.2637, "20": 0.2814, "25": 0.28361, "30": 0.28297, "35": 0.28276, "40": 0.28313, "45": 0.2873, "50": 0.28552}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1645.0, "25": 2124.0, "30": 2345.0, "35": 1780.0, "40": 1936.0, "45": 2289.0, "50": 2738.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0]}, "iteration_timing_avg": 0.23131970588235293}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.86312, "5": 10.86984, "10": 10.84273, "15": 10.88712, "20": 10.87623, "25": 10.83465, "30": 10.75356, "35": 10.67297, "40": 10.50224, "45": 10.28079, "50": 10.27239}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 284527616.0, "5": 284527616.0, "10": 284527616.0, "15": 284527616.0, "20": 416513536.0, "25": 416513536.0, "30": 416513536.0, "35": 416513536.0, "40": 416513536.0, "45": 416513536.0, "50": 416513536.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1464319488.0, "5": 1464320000.0, "10": 1465368576.0, "15": 1465368576.0, "20": 1596305408.0, "25": 1596305408.0, "30": 1596305408.0, "35": 1596305408.0, "40": 1596305408.0, "45": 1596305408.0, "50": 1596305920.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 7.22206, "5": 0.28793, "10": 0.2833, "15": 0.28906, "20": 0.29969, "25": 0.30075, "30": 0.29561, "35": 0.30149, "40": 0.29547, "45": 0.30118, "50": 0.29352}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1645.0, "25": 2124.0, "30": 2345.0, "35": 1780.0, "40": 1936.0, "45": 2289.0, "50": 2738.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 50
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_dev.json
-{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86312, "5": 10.86984, "10": 10.84273, "15": 10.88712, "20": 10.87623, "25": 10.83465, "30": 10.75356, "35": 10.67297, "40": 10.50224, "45": 10.28079, "50": 10.27239, "55": 10.20076, "60": 9.84045, "65": 9.27781, "70": 9.92981, "75": 9.61573, "80": 9.56042, "85": 9.74259, "90": 9.91759, "95": 9.61376, "100": 9.50538}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 284527616.0, "5": 284527616.0, "10": 284527616.0, "15": 284527616.0, "20": 416513536.0, "25": 416513536.0, "30": 416513536.0, "35": 416513536.0, "40": 416513536.0, "45": 416513536.0, "50": 416513536.0, "55": 416513536.0, "60": 416513536.0, "65": 416513536.0, "70": 416513536.0, "75": 416513536.0, "80": 416513536.0, "85": 416513536.0, "90": 416513536.0, "95": 416513536.0, "100": 416513536.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1464319488.0, "5": 1464319488.0, "10": 1464320000.0, "15": 1464320000.0, "20": 1594994688.0, "25": 1597091840.0, "30": 1597091840.0, "35": 1597091840.0, "40": 1597092352.0, "45": 1597092352.0, "50": 1597092352.0, "55": 1597092352.0, "60": 1597092352.0, "65": 1597092352.0, "70": 1597092352.0, "75": 1597092352.0, "80": 1597092352.0, "85": 1597092352.0, "90": 1597092352.0, "95": 1597092352.0, "100": 1597092352.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 3.9389, "5": 0.26761, "10": 0.26783, "15": 0.26387, "20": 0.27882, "25": 0.27734, "30": 0.2767, "35": 0.277, "40": 0.27635, "45": 0.27694, "50": 0.28016, "55": 0.27883, "60": 0.28002, "65": 0.27862, "70": 0.27887, "75": 0.27972, "80": 0.27714, "85": 0.27759, "90": 0.27766, "95": 0.27789, "100": 0.27817}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1645.0, "25": 2124.0, "30": 2345.0, "35": 1780.0, "40": 1936.0, "45": 2289.0, "50": 2738.0, "55": 2309.0, "60": 2740.0, "65": 2151.0, "70": 3646.0, "75": 2891.0, "80": 3546.0, "85": 3681.0, "90": 3861.0, "95": 4152.0, "100": 3405.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/golden_values_lts.json
-{"lm loss": {"start_step": 0, "end_step": 100, "step_interval": 5, "values": [10.86312, 10.87712, 10.87347, 10.88278, 10.89457, 10.84427, 10.69023, 10.62687, 10.53974, 10.26525, 10.21403, 9.9801, 9.96977, 9.93973, 9.81158, 9.28667, 9.63194, 9.19732, 9.48341, 9.62985]}, "num-zeros": {"start_step": 0, "end_step": 83, "step_interval": 5, "values": [2244.0, 2273.0, 2447.0, 2031.0, 2134.0, 2491.0, 2380.0, 3451.0, 3205.0, 2940.0, 3143.0, 3310.0, 3884.0, 3232.0, 3491.0, 3751.0, 5022.0]}, "iteration_timing_avg": 0.22914074626865674}
\ No newline at end of file
+{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.86312, "5": 10.86984, "10": 10.84273, "15": 10.88712, "20": 10.87623, "25": 10.83465, "30": 10.75356, "35": 10.67297, "40": 10.50224, "45": 10.28079, "50": 10.27239, "55": 10.20076, "60": 9.84045, "65": 9.27781, "70": 9.92981, "75": 9.61573, "80": 9.56042, "85": 9.74259, "90": 9.91759, "95": 9.61376, "100": 9.50538}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 284527616.0, "5": 284527616.0, "10": 284527616.0, "15": 284527616.0, "20": 416513536.0, "25": 416513536.0, "30": 416513536.0, "35": 416513536.0, "40": 416513536.0, "45": 416513536.0, "50": 416513536.0, "55": 416513536.0, "60": 416513536.0, "65": 416513536.0, "70": 416513536.0, "75": 416513536.0, "80": 416513536.0, "85": 416513536.0, "90": 416513536.0, "95": 416513536.0, "100": 416513536.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1465368064.0, "5": 1465368064.0, "10": 1465368576.0, "15": 1465368576.0, "20": 1596305920.0, "25": 1596305920.0, "30": 1596305920.0, "35": 1596305920.0, "40": 1596305920.0, "45": 1596305920.0, "50": 1596305920.0, "55": 1596305920.0, "60": 1596305920.0, "65": 1596305920.0, "70": 1596305920.0, "75": 1596305920.0, "80": 1596305920.0, "85": 1596305920.0, "90": 1596305920.0, "95": 1596305920.0, "100": 1596305920.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 9.41683, "5": 0.29649, "10": 0.2936, "15": 0.29369, "20": 0.30302, "25": 0.29665, "30": 0.30347, "35": 0.29671, "40": 0.29818, "45": 0.29562, "50": 0.30562, "55": 0.29659, "60": 0.29349, "65": 0.29455, "70": 0.30009, "75": 0.29572, "80": 0.29482, "85": 0.29505, "90": 0.29548, "95": 0.29481, "100": 0.30221}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": "nan", "5": "nan", "10": "nan", "15": "nan", "20": 1645.0, "25": 2124.0, "30": 2345.0, "35": 1780.0, "40": 1936.0, "45": 2289.0, "50": 2738.0, "55": 2309.0, "60": 2740.0, "65": 2151.0, "70": 3646.0, "75": 2891.0, "80": 3546.0, "85": 3681.0, "90": 3861.0, "95": 4152.0, "100": 3405.0}}}
\ No newline at end of file
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 100
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -47,4 +47,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: ckpt-resume
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel/model_config.yaml
@@ -17,8 +17,8 @@ MODEL_ARGS:
  --train-iters: 2000
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -45,4 +45,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline/model_config.yaml
@@ -19,8 +19,8 @@ MODEL_ARGS:
  --train-iters: 2000
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -45,4 +45,5 @@ MODEL_ARGS:
  --data-cache-path: ${DATA_CACHE_PATH}
  --fp16: true
  --apply-query-key-layer-scaling: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel/model_config.yaml
@@ -20,8 +20,8 @@ MODEL_ARGS:
  --train-iters: 2000
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp/model_config.yaml
@@ -20,8 +20,8 @@ MODEL_ARGS:
  --train-iters: 2000
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp/model_config.yaml
@@ -20,8 +20,8 @@ MODEL_ARGS:
  --train-iters: 2000
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -49,4 +49,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular
--- a/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp/model_config.yaml
@@ -20,8 +20,8 @@ MODEL_ARGS:
  --train-iters: 2000
  --timing-log-level: 2
  --lr-decay-iters: 320000
-  --save: ${CHECKPOINT_PATH}
-  --load: ${CHECKPOINT_PATH}
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/bpe/vocab.json
  --merge-file: ${DATA_PATH}/bpe/merges.txt
@@ -50,4 +50,5 @@ MODEL_ARGS:
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
+  --log-memory-to-tensorboard: true
 TEST_TYPE: regular