Commit 688448db authored by silencealiang's avatar silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -3,7 +3,11 @@ import glob
import json
import logging
import os
import pathlib
from typing import Callable, Dict, List, Optional, Union
import numpy as np
import pydantic
from tensorboard.backend.event_processing import event_accumulator
# By default TB tries to be smart about what to load in memory to avoid OOM
......@@ -12,27 +16,94 @@ from tensorboard.backend.event_processing import event_accumulator
# are small/short.
SIZE_GUIDANCE = {event_accumulator.TENSORS: 0, event_accumulator.SCALARS: 0}
logger = logging.getLogger()
logger = logging.getLogger(__name__)
class TypeOfTest(enum.Enum):
APPROX = 1
def approximate_threshold(rtol: float) -> Callable:
def _func(y_pred: List[Union[float, int]], y_true: List[Union[float, int]]):
return np.mean([np.mean(y_pred), np.mean(y_true)]) * rtol
return _func
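# Example (hypothetical numbers): approximate_threshold(rtol=0.05) applied to
# y_pred=[10.0, 10.2] and y_true=[10.1, 10.3] returns 0.05 * 10.15 = 0.5075,
# which `pipeline` below passes to np.allclose as the absolute tolerance.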
class TypeOfTestResult(enum.Enum):
APPROXIMATE = 1
DETERMINISTIC = 2
TYPE_OF_TEST_TO_METRIC = {
TypeOfTest.DETERMINISTIC: ["lm loss", "num-zeros"],
TypeOfTest.APPROX: ["lm loss", "iteration-time", "mem-allocated-bytes"],
}
class Test(pydantic.BaseModel):
pass
class NotApproximateError(Exception):
"""Raised if comparison is not within approximate bounds"""
class NotDeterminsticError(Exception):
"""Raised if comparison is not within approximate bounds"""
class ApproximateTest(Test):
atol: Optional[Union[int, float]] = 0
atol_func: Optional[Callable] = None
rtol: float = 1e-5
@property
def type_of_test_result(self) -> TypeOfTestResult:
return TypeOfTestResult.APPROXIMATE
def error_message(self, metric_name: str) -> NotApproximateError:
return NotApproximateError(f"Approximate comparison of {metric_name}: FAILED")
class DeterministicTest(Test):
@property
def atol(self) -> Union[int, float]:
return 0
atol_func: Optional[Callable] = None
@property
def rtol(self) -> float:
return 0.0
METRIC_TO_THRESHOLD = {
"iteration-time": 0.8,
"mem-allocated-bytes": 3 * 1000 * 1000, # 3MB
"lm loss": 0.05,
}
@property
def type_of_test_result(self) -> TypeOfTestResult:
return TypeOfTestResult.DETERMINISTIC
def error_message(self, metric_name: str) -> NotDeterminsticError:
return NotDeterminsticError(f"Exact comparison of {metric_name}: FAILED")
def read_tb_logs_as_list(path, index=0):
class GoldenValueMetric(pydantic.BaseModel):
start_step: int
end_step: int
step_interval: int
values: Dict[int, Union[int, float, str]]
def __repr__(self):
return f"Values ({self.start_step},{self.end_step},{self.step_interval}): {', '.join([str(f'({step}, {value})') for step, value in self.values.items()])}"
class GoldenValues(pydantic.RootModel):
root: Dict[str, GoldenValueMetric]
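# Parses golden-values JSON of the shape used by the files further below, e.g.:
# {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48424, ...}}}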
class MissingTensorboardLogsError(Exception):
"""Raised if Tensorboard logs are not found"""
class UndefinedMetricError(Exception):
"""Raised if a golden values metric has no test definition"""
class SkipMetricError(Exception):
"""Raised if a metric shall be skipped"""
def read_tb_logs_as_list(
path, index: int = 0, train_iters: int = 50, start_idx: int = 1, step_size: int = 5
) -> Optional[Dict[str, GoldenValueMetric]]:
"""Reads a TensorBoard Events file from the input path, and returns the
summary specified as input as a list.
......@@ -46,13 +117,11 @@ def read_tb_logs_as_list(path, index=0):
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
summaries = {}
if not files:
logger.info(f"File not found matching: {path}/events* || {path}/results/events*")
return summaries
logger.error(f"File not found matching: {path}/events* || {path}/results/events*")
return None
files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)))
files.sort(key=lambda x: os.path.getmtime(os.path.join(path, pathlib.Path(x).name)))
accumulators = []
if index == -1:
......@@ -66,30 +135,150 @@ def read_tb_logs_as_list(path, index=0):
ea.Reload()
accumulators.append(ea)
summaries = {}
for ea in accumulators:
for scalar_name in ea.Tags()["scalars"]:
if scalar_name in summaries:
summaries[scalar_name] += [round(x.value, 5) for x in ea.Scalars(scalar_name)]
for x in ea.Scalars(scalar_name):
if x.step not in summaries[scalar_name]:
summaries[scalar_name][x.step] = round(x.value, 5)
else:
summaries[scalar_name] = [round(x.value, 5) for x in ea.Scalars(scalar_name)]
summaries[scalar_name] = {
x.step: round(x.value, 5) for x in ea.Scalars(scalar_name)
}
golden_values = {}
for metric, values in summaries.items():
print(
f"Extracted {len(summaries[scalar_name])} values of {scalar_name} from Tensorboard \
logs. Here are the first 5 values: {summaries[scalar_name][:5]}"
# Add missing values
values = {
k: (values[k] if k in values else "nan")
for k in range(1, train_iters + 1)
if k == start_idx or (k > start_idx and int(k) % step_size == 0)
}
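# With the defaults (train_iters=50, start_idx=1, step_size=5) this keeps steps 1, 5, 10, ..., 50
# and stores the string "nan" for any kept step that the run did not log.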
golden_values[metric] = GoldenValueMetric(
start_step=min(values.keys()),
end_step=max(values.keys()),
step_interval=step_size,
values=values,
)
return summaries
# for metric_name, golden_value in golden_values.items():
# logger.info(
# f"Extracted {golden_value.end_step} values of {metric_name} from Tensorboard logs. Here are the sampled values: {golden_value.values}"
# )
return golden_values
def read_golden_values_from_json(
golden_values_path: Union[str, pathlib.Path]
) -> Dict[str, GoldenValueMetric]:
with open(golden_values_path) as f:
if os.path.exists(golden_values_path):
with open(golden_values_path) as f:
return GoldenValues(**json.load(f)).root
raise ValueError(f"File {golden_values_path} not found!")
def load_expected_data():
expected_metrics_file = os.getenv("EXPECTED_METRICS_FILE")
if expected_metrics_file is None:
raise ValueError("Unknown EXPECTED_METRICS_FILE")
def _filter_checks(
checks: List[Union[ApproximateTest, DeterministicTest]], filter_for_type_of_check
):
return [test for test in checks if test.type_of_test_result == filter_for_type_of_check]
with open(expected_metrics_file) as f:
if os.path.exists(expected_metrics_file):
with open(expected_metrics_file) as f:
return json.load(f)
def pipeline(
compare_approximate_results: bool,
golden_values: Dict[str, GoldenValueMetric],
tensorboard_logs: Dict[str, GoldenValueMetric],
checks: Dict[str, List[Union[ApproximateTest, DeterministicTest]]],
):
all_test_passed = True
failed_metrics = []
for golden_value_key, golden_value in golden_values.items():
try:
if golden_value_key not in list(tensorboard_logs.keys()):
raise MissingTensorboardLogsError(
f"Metric {golden_value_key} not found in Tensorboard logs! Please modify `model_config.yaml` to record it."
)
if golden_value_key not in checks or len(checks[golden_value_key]) == 0:
logger.debug(
"For metric `%s`, no check was defined. Will fall back to `DeterminsticTest` with exact thresholds.",
golden_value_key,
)
test = DeterministicTest()
else:
print(f"File {expected_metrics_file} not found!")
# For approximate tests, we cannot use deterministic
if compare_approximate_results is True:
tests = _filter_checks(checks[golden_value_key], TypeOfTestResult.APPROXIMATE)
# For deterministic, we can fall back to approximate
else:
tests = _filter_checks(
checks[golden_value_key], TypeOfTestResult.DETERMINISTIC
) or _filter_checks(checks[golden_value_key], TypeOfTestResult.APPROXIMATE)
if len(tests) != 1:
raise SkipMetricError(
f"No {'approximate' if compare_approximate_results is True else 'deterministic'} check found for {golden_value_key}: SKIPPED"
)
test = tests[0]
golden_value_list = list(golden_value.values.values())
actual_value_list = [
value
for value_step, value in tensorboard_logs[golden_value_key].values.items()
if value_step in golden_value.values.keys()
]
if golden_value_key == "iteration-time":
actual_value_list = actual_value_list[3:-1]
golden_value_list = golden_value_list[3:-1]
logger.info(
"For metric `%s`, the first 3 and the last scalars are removed from the list to reduce noise.",
golden_value_key,
)
actual_value_list = [np.inf if type(v) is str else v for v in actual_value_list]
golden_value_list = [np.inf if type(v) is str else v for v in golden_value_list]
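# Mapping unlogged ("nan") entries to np.inf makes a step that is missing from only one of the
# two series fail np.allclose, while a step missing from both still compares equal (inf == inf).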
if not np.allclose(
actual_value_list,
golden_value_list,
rtol=test.rtol,
atol=(
test.atol_func(actual_value_list, golden_value_list)
if test.atol_func is not None
else test.atol
),
):
logger.info("Actual values: %s", ", ".join([str(v) for v in actual_value_list]))
logger.info("Golden values: %s", ", ".join([str(v) for v in golden_value_list]))
raise test.error_message(golden_value_key)
result = f"{test.type_of_test_result.name} test for metric {golden_value_key}: PASSED"
result_code = 0
except (NotApproximateError, NotDeterminsticError, MissingTensorboardLogsError) as e:
result = str(e)
result_code = 1
except SkipMetricError:
logger.info(f"{test.type_of_test_result.name} test for {golden_value_key}: SKIPPED")
continue
log_emitter = logger.info if result_code == 0 else logger.error
log_emitter(result)
if result_code == 1:
all_test_passed = False
failed_metrics.append(golden_value_key)
assert all_test_passed, f"The following metrics failed: {', '.join(failed_metrics)}"
from typing import Dict, List, Union
import pytest
from tests.functional_tests.python_test_utils import common
def pytest_addoption(parser):
"""
Additional command-line arguments passed to pytest.
"""
parser.addoption(
"--allow-nondeterministic-algo",
action="store_true",
default=False,
help="If set, test system checks for approximate results.",
)
parser.addoption("--golden-values-path", action="store", help="Path to golden values")
parser.addoption(
"--train-iters", action="store", default=100, help="Number of train iters", type=int
)
parser.addoption("--tensorboard-path", action="store", help="Path to tensorboard records")
parser.addoption("--model-config-path", action="store", help="Path to model_config.yaml")
@pytest.fixture
def compare_approximate_results(request) -> bool:
"""Simple fixture returning whether to check against results approximately."""
return request.config.getoption("--allow-nondeterministic-algo") is True
@pytest.fixture
def golden_values(request):
"""Simple fixture returning golden values."""
return common.read_golden_values_from_json(request.config.getoption("--golden-values-path"))
@pytest.fixture
def train_iters(request):
"""Simple fixture returning number of train iters."""
return request.config.getoption("--train-iters")
@pytest.fixture
def tensorboard_logs(request, train_iters):
"""Simple fixture returning tensorboard metrics."""
return common.read_tb_logs_as_list(
request.config.getoption("--tensorboard-path"), train_iters=train_iters
)
@pytest.fixture
def tensorboard_path(request):
"""Simple fixture returning path to tensorboard logs."""
return request.config.getoption("--tensorboard-path")
@pytest.fixture
def model_config_path(request):
"""Simple fixture returning path to model_config.yaml."""
return request.config.getoption("--model-config-path")
......@@ -2,14 +2,20 @@ import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import json
import logging
import click
import yaml
from tests.functional_tests.python_test_utils import common
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@click.command()
@click.option("--logs-dir", required=True, type=str, help="Path to Tensorboard logs")
@click.option("--train-iters", required=True, type=int, help="Number of train iters")
@click.option("--output-path", required=False, type=str, help="Path to write golden values")
@click.option(
"--is-convergence-test/--is-normal-test",
......@@ -17,22 +23,39 @@ from tests.functional_tests.python_test_utils import common
help="Tensorboard index to extract",
default=False,
)
def collect_train_test_metrics(logs_dir: str, output_path: str, is_convergence_test: bool):
summaries = common.read_tb_logs_as_list(logs_dir, index=-1 if is_convergence_test else 0)
train_metrics = {
metric_name: {
"start_step": 0,
"end_step": len(metric_values),
"step_interval": 5,
"values": metric_values[0 : len(metric_values) : 5],
}
for metric_name, metric_values in summaries.items()
def collect_train_test_metrics(
logs_dir: str, train_iters: int, output_path: str, is_convergence_test: bool
):
summaries = common.read_tb_logs_as_list(
logs_dir, index=(0 if not is_convergence_test else -1), train_iters=train_iters, start_idx=1
)
if summaries is None:
logger.warning("No tensorboard logs found, no golden values created.")
return
summaries = {
golden_value_key: golden_value
for (golden_value_key, golden_value) in summaries.items()
if golden_value_key
in [
"iteration-time",
"mem-allocated-bytes",
"mem-max-allocated-bytes",
"lm loss",
"num-zeros",
]
}
if output_path is not None:
with open(output_path, "w") as fh:
json.dump(train_metrics, fh)
json.dump(
{
golden_value_key: golden_values.model_dump()
for golden_value_key, golden_values in summaries.items()
},
fh,
)
if __name__ == "__main__":
......
import os
from typing import List, Union
import numpy as np
import pytest
from .common import (
METRIC_TO_THRESHOLD,
TYPE_OF_TEST_TO_METRIC,
TypeOfTest,
load_expected_data,
read_tb_logs_as_list,
)
@pytest.fixture(params=load_expected_data().items())
def expected_data(request):
return request.param
# If we require a variation of tests for any of the other pipelines we can just inherit this class.
class TestCIPipeline:
allow_nondeterministic = bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO")))
logs_dir = os.getenv("LOGS_DIR")
# Replace symbol in namespace to fix function call result for lifetime of
# this class.
def _test_helper(self, metric_type: str, metric_dict: List[Union[int, float]], test_type):
expected_list = metric_dict['values']
print(f"The list of expected values: {expected_list} for metric {metric_type}")
try:
actual_list = read_tb_logs_as_list(self.logs_dir)[metric_type]
except KeyError as e:
raise KeyError(
f"Required metric {metric_type} not found in TB logs. Please make sure your model \
exports this metric, as it is required by the test case/golden values file"
) from e
if actual_list is None:
raise ValueError(f"No values of {metric_type} found in TB logs.")
actual_list_sliced = actual_list[
metric_dict["start_step"] : metric_dict["end_step"] : metric_dict["step_interval"]
]
print(f"The list of actual values: {actual_list_sliced}")
if metric_type == "iteration-time":
actual_list_sliced = actual_list_sliced[3:]
expected_list = expected_list[3:]
print("Removing first items of values for metric_type iteration-time")
if test_type == TypeOfTest.DETERMINISTIC:
assert np.allclose(
actual_list_sliced, expected_list, rtol=0, atol=0
), f"Actual is not equal to Expected for {metric_type}"
elif test_type == TypeOfTest.APPROX:
assert np.allclose(
actual_list_sliced, expected_list, rtol=1e-5, atol=METRIC_TO_THRESHOLD[metric_type]
), f"Actual is not equal to Expected for {metric_type}"
else:
raise ValueError(f"Unexpected test_type {test_type} provided")
def test_approx(self, expected_data):
expected_metric, expected_values = expected_data
if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.APPROX]:
self._test_helper(expected_metric, expected_values, TypeOfTest.APPROX)
else:
print(f"Skipping metric {expected_metric} for approximate as it is deterministic only.")
@pytest.mark.skipif(allow_nondeterministic, reason="Cannot expect exact results")
def test_deterministic(self, expected_data):
expected_metric, expected_values = expected_data
if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.DETERMINISTIC]:
self._test_helper(expected_metric, expected_values, TypeOfTest.DETERMINISTIC)
else:
print(f"Skipping metric {expected_metric} for deterministic as it is approximate only.")
# # @TODO: This is inactive, do we want to activate it?
# def iteration_timing_node(self):
# expected_iteration_timing_avg = self.expected["train_step_timing_avg"]
# iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"]
# idx = len(iteration_time) // 3
# iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:])
# assert (
# expected_iteration_timing_avg
# == pytest.approx(expected=iteration_time_avg, rel=self.margin_time)
# ), f"The time per global step must be approximately {expected_iteration_timing_avg} but "
# "it is {iteration_time_avg}."
# if deterministic, then also approx
# if not deterministic, then also approx
import json
import os
import numpy as np
import pytest
import scipy.stats as ss
from scipy.integrate import trapezoid
from .common import read_tb_logs_as_list
LOGS_DIR = os.getenv("LOGS_DIR")
EXPECTED_METRICS_FILE = os.getenv("EXPECTED_METRICS_FILE")
# If we require a variation of tests for any of the other pipelines we can just inherit this class.
class TestFP8CIPipeline:
margin_loss, margin_time = 0.2, 0.1
auc_threshold, correlation_threshold = 0.01, 0.999
expected = None
def _setup(self):
if os.path.exists(EXPECTED_METRICS_FILE):
with open(EXPECTED_METRICS_FILE) as f:
self.expected = json.load(f)
if self.expected is None:
raise FileNotFoundError("Expected data is none")
def _get_actual(self, loss_type):
actual_list = read_tb_logs_as_list(LOGS_DIR)[loss_type]
assert (
actual_list is not None
), f"No TensorBoard events file was found in the logs for {loss_type}."
return actual_list
def _margin_test_helper(self, loss_type):
expected = self.expected[loss_type]
expected_list = np.array(expected["values"])
actual_list = self._get_actual(loss_type)
actual_list_sliced = np.array(
actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]]
)
max_diff_index = np.argmax(np.abs(actual_list_sliced - expected_list))
max_diff = np.abs(actual_list_sliced[max_diff_index] - expected_list[max_diff_index])
print(
"[INFO - margin]: "
f"maximum absolute difference for {loss_type} is {max_diff} at index {max_diff_index}, "
f"Actual: {actual_list_sliced[max_diff_index]}, "
f"Expected: {expected_list[max_diff_index]}"
)
assert np.allclose(
actual_list_sliced, expected_list, rtol=1e-5, atol=self.margin_loss
), f"Actual is not equal to Expected for {loss_type}"
def _auc_test_helper(self, loss_type):
expected = self.expected[loss_type]
expected_list = np.array(expected["values"])
actual_list = self._get_actual(loss_type)
actual_list_sliced = np.array(
actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]]
)
def compute_auc(y_values):
x_values = np.arange(0, len(y_values), 1)
area = trapezoid(y_values, x_values)
return round(area, 5)
baseline_area = compute_auc(expected_list)
current_area = compute_auc(actual_list_sliced)
diff = abs(baseline_area - current_area)
print(
f"[INFO - AUC]: AUC diff: {diff * 100 / baseline_area} %, current: {current_area}, "
f"baseline: {baseline_area}"
)
assert (baseline_area <= 0) or (diff <= self.auc_threshold * baseline_area)
def _correlation_test_helper(self, loss_type):
expected = self.expected[loss_type]
expected_list = np.array(expected["values"])
actual_list = self._get_actual(loss_type)
actual_list_sliced = np.array(
actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]]
)
corr = ss.pearsonr(actual_list_sliced, expected_list).statistic
print(f"[INFO - Corr]: Corr: {corr}")
assert corr > self.correlation_threshold
@pytest.mark.xfail
def test_lm_loss_margin(self):
self._setup()
self._margin_test_helper("lm loss")
def test_lm_loss_auc(self):
self._setup()
self._auc_test_helper("lm loss")
@pytest.mark.xfail
def test_lm_loss_correlation(self):
self._setup()
self._correlation_test_helper("lm loss")
def iteration_timing_node(self):
expected_iteration_timing_avg = self.expected["train_step_timing_avg"]
iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"]
idx = len(iteration_time) // 3
iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:])
assert expected_iteration_timing_avg == pytest.approx(
expected=iteration_time_avg, rel=self.margin_time
), f"The time per global step must be approximately {expected_iteration_timing_avg} but it \
is {iteration_time_avg}."
import logging
from typing import Dict, List, Optional
import numpy as np
from tests.functional_tests.python_test_utils import common
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_regular_pipeline(
compare_approximate_results: bool,
golden_values: Dict[str, common.GoldenValueMetric],
tensorboard_logs: Dict[str, common.GoldenValueMetric],
checks: Optional[Dict[str, List[common.Test]]] = None,
):
if checks is None:
checks = {
"iteration-time": [common.ApproximateTest(atol=2.0, rtol=0)],
"mem-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"mem-max-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"lm loss": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0),
],
"num-zeros": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.20), rtol=0),
],
}
if (
len(
missing_metrics := [
golden_metric
for golden_metric in checks.keys()
if golden_metric not in golden_values.keys()
]
)
> 0
):
logger.error(
f"The following metrics are required but not provided in golden values: {', '.join(missing_metrics)}"
)
assert False
common.pipeline(
compare_approximate_results=compare_approximate_results,
golden_values=golden_values,
tensorboard_logs=tensorboard_logs,
checks=checks,
)
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import pytest
from tests.functional_tests.python_test_utils.common import TypeOfTest, read_tb_logs_as_list
LOGS_DIR = os.getenv("LOGS_DIR")
ALLOW_NONDETERMINISTIC = os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO")
STEP_INTERVAL = 5
def collect_train_test_metrics(logs_dir, index):
train_loss_list = read_tb_logs_as_list(logs_dir, index)["lm loss"]
train_loss_list = [round(elem, 3) for elem in train_loss_list]
train_metrics = {"lm loss": train_loss_list[0 : len(train_loss_list) : STEP_INTERVAL]}
str_train_metrics = str(train_metrics).replace("'", '"')
print("\n ----------- The following are the metrics for ----------")
print(f"\n {str_train_metrics}", flush=True)
return train_metrics
class TestCIPipeline:
margin_loss = 0.005
allow_nondeterministic = bool(int(ALLOW_NONDETERMINISTIC))
train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0)
train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1)
def _test_helper(self, loss_type, test_type):
expected = self.train_metrics_100[loss_type]
assert (
len(expected) == 100 // STEP_INTERVAL
), "Train metrics from first run (before checkpoint load) should \
have {100 // STEP_INTERVAL} elements"
print("expected : " + str(expected))
actual = self.train_metrics_50_to_100[loss_type]
assert (
len(actual) == 50 // STEP_INTERVAL
), "Train metrics from second run (after checkpoint load) should have \
{50 // STEP_INTERVAL} elements"
print("actual : " + str(actual))
start_idx_expected = len(expected) - len(actual)
print("start_idx_expected:", start_idx_expected)
# Here we will just be comparing values of actual and second half (50-100) of expected
for i, (expected_val, actual_val) in enumerate(zip(expected[start_idx_expected:], actual)):
step = start_idx_expected + i * STEP_INTERVAL
if test_type == TypeOfTest.APPROX:
assert actual_val == pytest.approx(
expected=expected_val, rel=self.margin_loss
), f"The loss at step {step} should be approximately {expected_val} but it is \
{actual_val}."
else:
assert (
actual_val == expected_val
), f"The value at step {step} should be {expected_val} but it is {actual_val}."
@pytest.mark.skipif(allow_nondeterministic, reason="Nondeterministic is allowed.")
def test_lm_loss_deterministic(self):
self._test_helper("lm loss", TypeOfTest.DETERMINISTIC)
@pytest.mark.skipif(not allow_nondeterministic, reason="Nondeterministic is not allowed.")
def test_lm_loss_nondeterministic(self):
self._test_helper("lm loss", TypeOfTest.APPROX)
import logging
from typing import Dict
import numpy as np
import yaml
from tests.functional_tests.python_test_utils import common, test_regular_pipeline
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_resume_checkpoint_pipeline(
compare_approximate_results: bool, tensorboard_path: str, train_iters: int
):
first_run_values = common.read_tb_logs_as_list(
tensorboard_path, index=0, train_iters=train_iters, start_idx=(train_iters // 2) + 1
)
second_run_values = common.read_tb_logs_as_list(
tensorboard_path, index=1, train_iters=train_iters, start_idx=(train_iters // 2) + 1
)
checks = {
"iteration-time": [common.ApproximateTest(atol=2.0, rtol=0)],
"mem-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"mem-max-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"lm loss": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0),
],
"num-zeros": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.20), rtol=0),
],
}
if (
len(
missing_metrics := [
golden_metric
for golden_metric in checks.keys()
if golden_metric not in first_run_values.keys()
]
)
> 0
):
logger.error(
f"The following metrics are required but not logged during training: {', '.join(missing_metrics)}"
)
assert False
first_run_values = {
metric_name: metric_values
for (metric_name, metric_values) in first_run_values.items()
if metric_name in checks.keys()
}
second_run_values = {
metric_name: metric_values
for (metric_name, metric_values) in second_run_values.items()
if metric_name in checks.keys()
}
logger.info(first_run_values)
logger.info(second_run_values)
test_regular_pipeline.test_regular_pipeline(
compare_approximate_results=compare_approximate_results,
golden_values=first_run_values,
tensorboard_logs=second_run_values,
checks=checks,
)
......@@ -26,9 +26,11 @@ MANDATORY_VARS=(
"TRAINING_PARAMS_PATH"
"OUTPUT_PATH"
"TENSORBOARD_PATH"
"CHECKPOINT_PATH"
"CHECKPOINT_SAVE_PATH"
"CHECKPOINT_LOAD_PATH"
"DATA_PATH"
"RUN_NUMBER"
"REPEAT"
)
for mandatory_var in "${MANDATORY_VARS[@]}"; do
if [[ -z "${!mandatory_var}" ]]; then
......@@ -37,12 +39,9 @@ for mandatory_var in "${MANDATORY_VARS[@]}"; do
fi
done
cp $TRAINING_PARAMS_PATH "$TRAINING_PARAMS_PATH.${SLURM_PROCID}"
TRAINING_PARAMS_PATH="$TRAINING_PARAMS_PATH.${SLURM_PROCID}"
# Envsubst model_params
cat $TRAINING_PARAMS_PATH | envsubst "$(env | cut -d= -f1 | sed -e 's/^/$/')" >$TRAINING_PARAMS_PATH.tmp
mv $TRAINING_PARAMS_PATH.tmp "$TRAINING_PARAMS_PATH"
TRAINING_PARAMS_PATH="$TRAINING_PARAMS_PATH.tmp"
# Pull env vars to export
ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' "$TRAINING_PARAMS_PATH")
......@@ -54,13 +53,13 @@ while IFS= read -r ARGUMENT; do
export "$KEY"="$VALUE"
echo "$KEY=$VALUE"
done <<< "$ENV_VARS"
done <<<"$ENV_VARS"
# Run before script
SCRIPT=$(cat "$TRAINING_PARAMS_PATH" | yq '.BEFORE_SCRIPT')
if [[ "$SCRIPT" != null ]]; then
eval "$SCRIPT"
fi;
BEFORE_SCRIPT=$(cat "$TRAINING_PARAMS_PATH" | yq '.BEFORE_SCRIPT')
if [[ "$BEFORE_SCRIPT" != null ]]; then
eval "$BEFORE_SCRIPT"
fi
# Exit earlier to leave time for properly saving checkpoint
if [[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') == *nemo* ]]; then
......@@ -88,6 +87,33 @@ PARAMS="$PARAMS $TRAINING_PARAMS_FROM_CONFIG"
export PYTHONPATH="$(pwd):${PYTHONPATH:-}"
export WANDB_API_KEY="${WANDB_API_KEY:-}"
######## Distributed training settings. ########
echo "------ARGUMENTS for SLURM ---"
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-6000}
NUM_NODES=${NUM_NODES:-${SLURM_NNODES}}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID}}
LAST_RANK=7
export LOG_DIR=$OUTPUT_PATH/logs/$REPEAT
mkdir -p $LOG_DIR
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
--node_rank $SLURM_NODEID
--log-dir $LOG_DIR
--tee "0:3,7:3"
--redirects "3"
)
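# torchrun's --redirects 3 sends each local rank's stdout/stderr to files under $LOG_DIR,
# while --tee "0:3,7:3" additionally mirrors ranks 0 and 7 to the console.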
# Start training
python $TRAINING_SCRIPT_PATH $PARAMS
torchrun ${DISTRIBUTED_ARGS[@]} $TRAINING_SCRIPT_PATH $PARAMS || EXIT_CODE=$?
# Run after script
AFTER_SCRIPT=$(cat "$TRAINING_PARAMS_PATH" | yq '.AFTER_SCRIPT')
if [[ "$AFTER_SCRIPT" != null ]]; then
eval "$AFTER_SCRIPT"
fi
......@@ -21,7 +21,8 @@ MANDATORY_VARS=(
"GOLDEN_VALUES_PATH"
"OUTPUT_PATH"
"TENSORBOARD_PATH"
"CHECKPOINT_PATH"
"CHECKPOINT_SAVE_PATH"
"CHECKPOINT_LOAD_PATH"
"DATA_PATH"
"DATA_CACHE_PATH"
)
......@@ -32,42 +33,97 @@ for mandatory_var in "${MANDATORY_VARS[@]}"; do
fi
done
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
RECORD_CHECKPOINTS=${RECORD_CHECKPOINTS:-"false"}
TEST_TYPES=("regular" "ckpt-resume" "frozen-resume" "frozen-start" "release")
mkdir -p $CHECKPOINT_SAVE_PATH
mkdir -p $CHECKPOINT_LOAD_PATH
_CHECKPOINT_LOAD_PATH=$CHECKPOINT_LOAD_PATH
_CHECKPOINT_SAVE_PATH=$CHECKPOINT_SAVE_PATH
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
ROOT_DIR=$(realpath $SCRIPT_DIR/../../../)
# Extract settings from params file
TEST_TYPE=$(cat $TRAINING_PARAMS_PATH \
| yq '.TEST_TYPE')
NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH \
| yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO')
SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH \
| yq '.ENV_VARS.SKIP_PYTEST')
for i in $(seq 1 $N_REPEAT);
do
TEST_TYPE=$(cat $TRAINING_PARAMS_PATH |
yq '.TEST_TYPE')
NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH |
yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO')
SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH |
yq '.ENV_VARS.SKIP_PYTEST')
TRAIN_ITERS=$(cat $TRAINING_PARAMS_PATH |
yq '.MODEL_ARGS."--train-iters" // "100"')
for i in $(seq 1 $N_REPEAT); do
if [[ $i -gt 1 ]]; then
rm -rf $CHECKPOINT_PATH/*
rm -rf $CHECKPOINT_SAVE_PATH/*
rm -rf /tmp/checkpoints/*
rm -rf $TENSORBOARD_PATH/*
fi
# Training
# First run never loads from a checkpoint
export RUN_NUMBER=1
export REPEAT=$i
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
if [[ "$TEST_TYPE" = "frozen-start" ]]; then
export CHECKPOINT_LOAD_PATH=$_CHECKPOINT_LOAD_PATH
else
export CHECKPOINT_LOAD_PATH=/tmp/checkpoints/
fi
if [[ "$TEST_TYPE" = "release" ]]; then
export CHECKPOINT_LOAD_PATH=$_CHECKPOINT_LOAD_PATH
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
fi
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
# Maybe checkpoint resume training
if [[ "$TEST_TYPE" = "frozen-resume" && -z "$(ls -A "$_CHECKPOINT_LOAD_PATH" 2>/dev/null)" ]]; then
echo "No frozen checkpoint found. Will skip second run."
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_SAVE_PATH/latest_checkpointed_iteration.txt
break
fi
if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
if [[ ${SLURM_PROCID} -eq 0 ]]; then
rm -rf $CHECKPOINT_PATH/iter_0000100;
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt;
export CHECKPOINT_LOAD_PATH=$CHECKPOINT_SAVE_PATH
rm -rf "$CHECKPOINT_LOAD_PATH/iter_0000$TRAIN_ITERS"
echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_LOAD_PATH/latest_checkpointed_iteration.txt
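# Drop the final checkpoint and point 'latest' at the halfway iteration so run 2 resumes from train_iters/2.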
export RUN_NUMBER=2
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
fi
if [[ "$TEST_TYPE" == "frozen-resume" ]]; then
# Checkpoint-resume tests load from prev run
export CHECKPOINT_LOAD_PATH=$_CHECKPOINT_LOAD_PATH
export CHECKPOINT_SAVE_PATH=/tmp/checkpoints/
export RUN_NUMBER=2
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_SAVE_PATH/latest_checkpointed_iteration.txt
fi
if [[ ${SLURM_PROCID} -gt 0 ]]; then
continue
if [[ "$TEST_TYPE" == "release" ]]; then
SKIP_PYTEST=0
fi
if [[ ${RECORD_CHECKPOINTS} == "true" ]]; then
echo "Skipping Pytest during checkpoint recording."
SKIP_PYTEST=1
fi
# Maybe run tests
if [[ ${SKIP_PYTEST:-0} != 1 ]]; then
# Save run results
export PYTHONPATH=$ROOT_DIR
if [[ "$TEST_TYPE" == "release" ]]; then
......@@ -75,29 +131,37 @@ do
else
EXTRACT_ARGS=("--is-normal-test")
fi
python3 $ROOT_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py \
--logs-dir $TENSORBOARD_PATH \
--train-iters $TRAIN_ITERS \
--output-path ${OUTPUT_PATH}/$(basename $GOLDEN_VALUES_PATH) \
"${EXTRACT_ARGS[@]}"
# Maybe run tests
if [[ ${SKIP_PYTEST:-0} != 1 ]]; then
export NVTE_ALLOW_NONDETERMINISTIC_ALGO
export LOGS_DIR=$TENSORBOARD_PATH
if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
echo "Running pytest 1st vs 2nd run comparison"
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
if [[ "${NVTE_ALLOW_NONDETERMINISTIC_ALGO}" == "1" ]]; then
ALLOW_NONDETERMINISTIC_ALGO_ARG="--allow-nondeterministic-algo"
fi
elif [[ "$TEST_TYPE" == "regular" ]]; then
echo "Running pytest checks against golden values"
export EXPECTED_METRICS_FILE=$GOLDEN_VALUES_PATH
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py
else
pytest -s -o log_cli=true --log-cli-level=info $ROOT_DIR/tests/functional_tests/python_test_utils/test_regular_pipeline.py \
--golden-values-path $GOLDEN_VALUES_PATH \
--tensorboard-path $TENSORBOARD_PATH \
--model-config-path ${TRAINING_PARAMS_PATH} \
$ALLOW_NONDETERMINISTIC_ALGO_ARG
if [[ "$TEST_TYPE" == "ckpt-resume" || "$TEST_TYPE" == "frozen-resume" ]]; then
echo "Running pytest 1st vs 2nd run comparison"
pytest -s -o log_cli=true --log-cli-level=info $ROOT_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py \
--tensorboard-path $TENSORBOARD_PATH \
--train-iters $TRAIN_ITERS \
--model-config-path ${TRAINING_PARAMS_PATH} \
$ALLOW_NONDETERMINISTIC_ALGO_ARG
fi
if [[ ! " ${TEST_TYPES[*]} " =~ " ${TEST_TYPE} " ]]; then
echo "Test type $TEST_TYPE not yet implemented."
fi
fi
done
{ "lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49569,
10.48173,
10.48047,
10.45353,
10.44394,
10.35611,
10.13779,
10.04017,
9.86834,
9.67307
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2254.0,
2585.0,
2101.0,
2157.0,
2241.0,
2475.0,
2890.0,
3199.0,
3524.0,
3090.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
13.65829,
1.27589,
1.2782,
1.32374,
1.26543,
1.26423,
1.26203,
1.54723,
1.27297,
1.26491
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48424, "10": 10.49936, "15": 10.46628, "20": 10.44794, "25": 10.34964, "30": 10.17263, "35": 10.04261, "40": 9.90783, "45": 9.75774, "50": 9.67693}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2182.0, "5": 2584.0, "10": 2205.0, "15": 2539.0, "20": 2089.0, "25": 2604.0, "30": 2913.0, "35": 2967.0, "40": 2378.0, "45": 3923.0, "50": 3599.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.04517, "5": 1.25649, "10": 1.25549, "15": 1.2581, "20": 1.26387, "25": 1.25714, "30": 1.25866, "35": 1.26592, "40": 1.24291, "45": 1.23727, "50": 1.24404}}}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44396, 10.35607, 10.13786, 10.04016, 9.86838, 9.67302]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2291.0, 2485.0, 2953.0, 3287.0, 3440.0, 3059.0]}, "iteration_timing_avg": 0.8110379411764704}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34964, "30": 10.1728, "35": 10.04262, "40": 9.90767, "45": 9.75792, "50": 9.67684}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2151.0, "25": 2601.0, "30": 2801.0, "35": 3107.0, "40": 2294.0, "45": 3909.0, "50": 3482.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2300849152.0, "5": 3043311616.0, "10": 3043311616.0, "15": 3043311616.0, "20": 3043311616.0, "25": 3043311616.0, "30": 3043311616.0, "35": 3043311616.0, "40": 3043311616.0, "45": 3043311616.0, "50": 3043311616.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.67278, "5": 1.17709, "10": 1.18485, "15": 1.20778, "20": 1.16573, "25": 1.17871, "30": 1.16949, "35": 1.16897, "40": 1.16996, "45": 1.16571, "50": 1.17045}}}
\ No newline at end of file
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.49569, "5": 10.48402, "10": 10.49933, "15": 10.46635, "20": 10.44782, "25": 10.34968, "30": 10.17276, "35": 10.04265, "40": 9.90757, "45": 9.75784, "50": 9.67694, "55": 9.55383, "60": 9.45452, "65": 9.42152, "70": 9.30114, "75": 9.3222, "80": 9.26181, "85": 9.2967, "90": 9.23351, "95": 9.23792, "100": 9.10613}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2254.0, "5": 2635.0, "10": 2165.0, "15": 2534.0, "20": 2227.0, "25": 2559.0, "30": 2905.0, "35": 3026.0, "40": 2314.0, "45": 3924.0, "50": 3557.0, "55": 3573.0, "60": 2689.0, "65": 3434.0, "70": 3935.0, "75": 5047.0, "80": 3601.0, "85": 4133.0, "90": 4603.0, "95": 4291.0, "100": 3165.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0, "55": 1754654208.0, "60": 1754654208.0, "65": 1754654208.0, "70": 1754654208.0, "75": 1754654208.0, "80": 1754654208.0, "85": 1754654208.0, "90": 1754654208.0, "95": 1754654208.0, "100": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2503224320.0, "5": 3245686784.0, "10": 3245686784.0, "15": 3245686784.0, "20": 3245686784.0, "25": 3245686784.0, "30": 3245686784.0, "35": 3245686784.0, "40": 3245686784.0, "45": 3245686784.0, "50": 3245686784.0, "55": 3245686784.0, "60": 3245686784.0, "65": 3245686784.0, "70": 3245686784.0, "75": 3245686784.0, "80": 3245686784.0, "85": 3245686784.0, "90": 3245686784.0, "95": 3245686784.0, "100": 3245686784.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 14.19715, "5": 1.20808, "10": 1.19543, "15": 1.19563, "20": 1.42719, "25": 1.40901, "30": 1.18769, "35": 1.43684, "40": 1.18523, "45": 1.18204, "50": 1.18891, "55": 1.20368, "60": 1.19171, "65": 1.18981, "70": 1.17772, "75": 1.18903, "80": 1.17548, "85": 1.1753, "90": 1.36634, "95": 1.17827, "100": 1.17843}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34964, "30": 10.1728, "35": 10.04262, "40": 9.90767, "45": 9.75792, "50": 9.67684, "55": 9.55378, "60": 9.45458, "65": 9.42133, "70": 9.30109, "75": 9.32203, "80": 9.26184, "85": 9.29667, "90": 9.23332, "95": 9.23793, "100": 9.10611}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2151.0, "25": 2601.0, "30": 2801.0, "35": 3107.0, "40": 2294.0, "45": 3909.0, "50": 3482.0, "55": 3606.0, "60": 2653.0, "65": 3341.0, "70": 3849.0, "75": 5090.0, "80": 3613.0, "85": 4194.0, "90": 4618.0, "95": 4439.0, "100": 3224.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0, "55": 1754654208.0, "60": 1754654208.0, "65": 1754654208.0, "70": 1754654208.0, "75": 1754654208.0, "80": 1754654208.0, "85": 1754654208.0, "90": 1754654208.0, "95": 1754654208.0, "100": 1754654208.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.95742, "5": 1.16734, "10": 2.45473, "15": 1.45839, "20": 1.51474, "25": 1.15989, "30": 1.14801, "35": 1.14584, "40": 1.15517, "45": 1.14468, "50": 1.14969, "55": 1.15684, "60": 1.14892, "65": 1.14737, "70": 1.30233, "75": 1.37176, "80": 1.1466, "85": 1.24468, "90": 1.15157, "95": 1.15026, "100": 1.15254}}}
\ No newline at end of file
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
--num-attention-heads: 16
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--log-memory-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 128
--seq-length: 512
--max-position-embeddings: 512
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.0001
--min-lr: 0.00001
--lr-warmup-fraction: 0.01
--log-interval: 1
--save-interval: 50
--eval-interval: 1000
--eval-iters: 10
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 2
--deterministic-mode: true
--use-checkpoint-args: true
--use-checkpoint-opt_param-scheduler: true
--no-gradient-accumulation-fusion: true
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--ckpt-format: torch
--attention-backend: unfused
TEST_TYPE: frozen-resume
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49566,
10.48172,
10.48046,
10.45369,
10.44391,
10.35613,
10.13791,
10.04025,
9.86848,
9.67328
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2183.0,
2571.0,
2097.0,
2118.0,
2414.0,
2464.0,
2988.0,
3223.0,
3481.0,
3046.0
]
},
"mem-allocated-bytes": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1767237120.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
13.74859,
1.16037,
1.15664,
1.28303,
1.16087,
1.1576,
1.15188,
1.1644,
1.15171,
1.38366
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49566, "5": 10.48412, "10": 10.49946, "15": 10.46625, "20": 10.44783, "25": 10.34967, "30": 10.17283, "35": 10.04281, "40": 9.90782, "45": 9.75786, "50": 9.67692}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2183.0, "5": 2683.0, "10": 2206.0, "15": 2493.0, "20": 2165.0, "25": 2528.0, "30": 2774.0, "35": 3054.0, "40": 2250.0, "45": 3947.0, "50": 3608.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1767237120.0, "5": 1767237120.0, "10": 1767237120.0, "15": 1767237120.0, "20": 1767237120.0, "25": 1768285696.0, "30": 1767237120.0, "35": 1767237120.0, "40": 1767237120.0, "45": 1767237120.0, "50": 1767237120.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.83402, "5": 1.12649, "10": 1.1312, "15": 1.12374, "20": 1.12209, "25": 1.13995, "30": 1.38104, "35": 1.14649, "40": 1.14975, "45": 1.14816, "50": 1.15079}}}
\ No newline at end of file
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49566,
10.48166,
10.48045,
10.45348,
10.44412,
10.3561,
10.13792,
10.04026,
9.86832,
9.67306
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2183.0,
2469.0,
2115.0,
2126.0,
2281.0,
2389.0,
3013.0,
3255.0,
3491.0,
3062.0
]
},
"mem-allocated-bytes": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
14.75035,
1.17988,
1.18643,
1.18301,
1.19116,
1.19494,
1.54654,
1.19342,
1.1823,
1.18039
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49566, "5": 10.48418, "10": 10.49947, "15": 10.46646, "20": 10.44777, "25": 10.34987, "30": 10.17278, "35": 10.04282, "40": 9.90771, "45": 9.75789, "50": 9.67683}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2183.0, "5": 2533.0, "10": 2162.0, "15": 2548.0, "20": 2180.0, "25": 2557.0, "30": 2908.0, "35": 2999.0, "40": 2252.0, "45": 3808.0, "50": 3622.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1767237120.0, "5": 1767237120.0, "10": 1767237120.0, "15": 1767237120.0, "20": 1767237120.0, "25": 1767237120.0, "30": 1767237120.0, "35": 1767237120.0, "40": 1767237120.0, "45": 1767237120.0, "50": 1767237120.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.23164, "5": 1.1819, "10": 1.18193, "15": 1.18147, "20": 1.18394, "25": 1.37105, "30": 1.18551, "35": 1.18659, "40": 1.18004, "45": 1.183, "50": 1.196}}}
\ No newline at end of file
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......