Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed
{
"lm loss": {
"start_step": 0,
"end_step": 100,
"step_interval": 5,
"values": [
10.39854,
9.41109,
8.8833,
8.56279,
8.28765,
8.10226,
7.83824,
7.53414,
7.39426,
7.28765,
7.36798,
7.22207,
7.10595,
7.05273,
6.91414,
6.96485,
6.97279,
7.03525,
6.70355,
6.97029
]
},
"num-zeros": {
"start_step": 0,
"end_step": 100,
"step_interval": 5,
"values": [
43320.0,
40948.0,
43971.0,
41622.0,
44740.0,
43919.0,
41231.0,
42497.0,
44664.0,
43894.0,
41149.0,
43254.0,
39687.0,
45400.0,
43313.0,
43891.0,
45351.0,
45692.0,
46187.0,
44657.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 100,
"step_interval": 5,
"values": [
14.46368,
0.41717,
0.42344,
0.4102,
0.40332,
0.40531,
0.40418,
0.40386,
0.40711,
0.4048,
0.40536,
0.40331,
0.40175,
0.4047,
0.40982,
0.40834,
0.40594,
0.40872,
0.40896,
0.41014
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.39854, "5": 9.39701, "10": 9.03359, "15": 8.67298, "20": 8.28241, "25": 8.00349, "30": 7.88919, "35": 7.67196, "40": 7.50912, "45": 7.35246, "50": 7.18229, "55": 7.15567, "60": 7.14148, "65": 7.00001, "70": 7.0554, "75": 7.05859, "80": 6.94155, "85": 6.84584, "90": 7.2405, "95": 6.84353, "100": 6.96854}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43320.0, "5": 45392.0, "10": 45363.0, "15": 43919.0, "20": 44778.0, "25": 42432.0, "30": 43986.0, "35": 43261.0, "40": 43242.0, "45": 43266.0, "50": 43346.0, "55": 43875.0, "60": 41289.0, "65": 44697.0, "70": 45530.0, "75": 44661.0, "80": 41029.0, "85": 43973.0, "90": 44723.0, "95": 44054.0, "100": 42464.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2140224000.0, "5": 2140224000.0, "10": 2140224000.0, "15": 2140224000.0, "20": 2140224000.0, "25": 2140224000.0, "30": 2140224000.0, "35": 2140224000.0, "40": 2140224000.0, "45": 2140224000.0, "50": 2140224000.0, "55": 2140224000.0, "60": 2140224000.0, "65": 2140224000.0, "70": 2140224000.0, "75": 2140224000.0, "80": 2140224000.0, "85": 2140224000.0, "90": 2140224000.0, "95": 2140224000.0, "100": 2140224000.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2372122112.0, "5": 3305918976.0, "10": 3305918976.0, "15": 3305918976.0, "20": 3305918976.0, "25": 3305918976.0, "30": 3305918976.0, "35": 3305918976.0, "40": 3305918976.0, "45": 3305918976.0, "50": 3305918976.0, "55": 3305918976.0, "60": 3305918976.0, "65": 3305918976.0, "70": 3305918976.0, "75": 3305918976.0, "80": 3305918976.0, "85": 3305918976.0, "90": 3305918976.0, "95": 3305918976.0, "100": 3306050048.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.40654, "5": 0.40596, "10": 0.41633, "15": 0.39729, "20": 0.39823, "25": 0.39786, "30": 0.39874, "35": 0.39845, "40": 0.40982, "45": 0.39982, "50": 0.39604, "55": 0.39557, "60": 0.39545, "65": 0.39649, "70": 0.39623, "75": 0.39574, "80": 0.40039, "85": 0.39829, "90": 0.39569, "95": 0.39538, "100": 0.39981}}}
\ No newline at end of file
......@@ -34,8 +34,8 @@ MODEL_ARGS:
--tokenizer-type: BertWordPieceCase
--calculate-per-token-loss: true
--split: 99982,9,9
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-params-norm: true
--log-num-zeros-in-grad: true
......@@ -50,4 +50,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--deterministic-mode: true
--ckpt-format: torch
--log-memory-to-tensorboard: true
TEST_TYPE: regular
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: '1'
NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1'
TEST_TYPE: 'release'
MODEL_ARGS:
# T5 model args
......@@ -16,7 +15,6 @@ MODEL_ARGS:
--max-position-embeddings: 512
--init-method-std: 0.015
--attention-backend: unfused
# Training args
--micro-batch-size: 32
--global-batch-size: 512
......@@ -47,8 +45,8 @@ MODEL_ARGS:
--log-interval: 100
--save-interval: 2000
--eval-interval: 1000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--eval-iters: 10
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-timers-to-tensorboard: true
......
......@@ -21,13 +21,28 @@ def resolve_cluster_config(cluster: str) -> str:
raise ValueError(f"Unknown cluster {cluster} provided.")
def resolve_artifact_config(cluster: str) -> str:
if cluster == "dgxh100_eos":
return "eos_lustre"
if cluster == "dgxa100_dracooci":
return "draco-oci_lustre"
if cluster == "dgxa100_dracooci-ord":
return "draco-oci-ord_lustre"
if cluster == "dgxh100_coreweave":
return "coreweave_lustre"
raise ValueError(f"Unknown cluster {cluster} provided.")
def flatten_products(
workload_manifest: jetclient.JETWorkloadManifest,
) -> jetclient.JETWorkloadManifest:
"""Flattens a nested dict of products"""
workload_manifest.products = [
dict(zip(inp.keys(), values))
for inp in workload_manifest.products
dict(**dict(zip(inp.keys(), values)), **{"test_case": product['test_case'][0]})
for product in workload_manifest.products
if "products" in product
for inp in product['products']
for values in itertools.product(*inp.values())
]
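For reference, a minimal standalone sketch of the nested-product expansion that the updated flatten_products performs; the manifest shape and test-case name below are illustrative only, mirroring the recipe YAML further down in this diff.

import itertools

# Hypothetical manifest excerpt; each entry pairs one test_case with its own
# nested "products" matrix, as in the updated recipe YAML.
nested_products = [
    {
        "test_case": ["bert_mr_mcore_tp2_pp2_dgx_a100_1N8G"],
        "products": [
            {"environment": ["dev"], "scope": ["mr"]},
            {"environment": ["lts"], "scope": ["nightly"]},
        ],
    }
]

# Same comprehension as in flatten_products above.
flat = [
    dict(**dict(zip(inp.keys(), values)), **{"test_case": product["test_case"][0]})
    for product in nested_products
    if "products" in product
    for inp in product["products"]
    for values in itertools.product(*inp.values())
]

print(flat)
# [{'environment': 'dev', 'scope': 'mr', 'test_case': 'bert_mr_mcore_tp2_pp2_dgx_a100_1N8G'},
#  {'environment': 'lts', 'scope': 'nightly', 'test_case': 'bert_mr_mcore_tp2_pp2_dgx_a100_1N8G'}]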
......@@ -195,6 +210,7 @@ def load_workloads(
model: Optional[str] = None,
test_case: Optional[str] = None,
container_image: Optional[str] = None,
record_checkpoints: Optional[str] = None,
) -> List[jetclient.JETWorkloadManifest]:
"""Return all workloads from disk that match scope and platform."""
recipes_dir = BASE_PATH / ".." / "recipes"
......@@ -238,4 +254,17 @@ def load_workloads(
workloads.append(build_workload)
workload.spec.n_repeat = n_repeat
workload.spec.time_limit = time_limit
if record_checkpoints == 'true':
workload.outputs = [
{
"type": "artifact",
"key": f"unverified/model/mcore-ci/{container_tag}/{{model}}/{{name}}",
"subdir": "checkpoints",
"name": r"{model}/{name}",
"description": r"Checkpoint of {model}/{name}",
"pic": {"name": "Mcore CI", "email": "okoenig@nvidia.com"},
"labels": {"origin": "ADLR/Megatron-LM"},
}
]
return workloads
import glob
import logging
import os
import pathlib
import shutil
import zipfile
import click
import gitlab
BASE_PATH = pathlib.Path(__file__).parent.resolve()
PROJECT_ID = int(os.getenv("CI_PROJECT_ID", 19378))
logger = logging.getLogger(__name__)
@click.command()
@click.option("--pipeline-id", required=True, type=int, help="Pipeline ID")
def main(pipeline_id: int):
logging.basicConfig(level=logging.INFO)
logger.info('Started')
gl = gitlab.Gitlab(
f"https://{os.getenv('GITLAB_ENDPOINT')}", private_token=os.getenv("RO_API_TOKEN")
)
project = gl.projects.get(PROJECT_ID)
pipeline = project.pipelines.get(pipeline_id)
print(pipeline.bridges.list())
pipeline_bridges = [
pipeline_bridge
for pipeline_bridge in pipeline.bridges.list()
if pipeline_bridge.name.startswith("test:unit_tests")
and pipeline_bridge.downstream_pipeline is not None
]
ASSETS_DIR = pathlib.Path("tmp") / "results" / "iteration=0"
for pipeline_bridge in pipeline_bridges:
functional_pipeline = project.pipelines.get(pipeline_bridge.downstream_pipeline['id'])
functional_pipeline_jobs = functional_pipeline.jobs.list(get_all=True)
if "legacy" in pipeline_bridge.name:
continue
logger.info("Starting with pipeline %s", pipeline_bridge.name)
for functional_pipeline_job in functional_pipeline_jobs:
job = project.jobs.get(functional_pipeline_job.id)
logger.info("Starting with job %s", job.name)
try:
file_name = '__artifacts.zip'
with open(file_name, "wb") as f:
job.artifacts(streamed=True, action=f.write)
zip = zipfile.ZipFile(file_name)
zip.extractall("tmp")
logger.info("Downloaded artifacts of job %s", job.name)
except Exception:
continue
os.unlink(file_name)
restart_dir = os.listdir(pathlib.Path("tmp") / "results" / "iteration=0")[-1]
coverage_report_source = list(
glob.glob(
str(
pathlib.Path(ASSETS_DIR)
/ f"{restart_dir}"
/ "assets"
/ "basic"
/ "*"
/ "coverage_report"
)
)
)[0]
coverage_report_target = (
pathlib.Path("coverage_results") / job.name.replace("/", "-") / "coverage_report"
)
if pathlib.Path(coverage_report_source).exists():
pathlib.Path(coverage_report_target.parent).mkdir(parents=True, exist_ok=True)
logger.info(
"Move artifacts from %s to %s", coverage_report_source, coverage_report_target
)
shutil.move(coverage_report_source, coverage_report_target)
else:
logger.info(
"coverage_report for %s does not exist. Skip.", str(f"{job.stage} / {job.name}")
)
shutil.rmtree("tmp")
logger.info("beep boop: All done!")
if __name__ == "__main__":
main()
import logging
import os
import pathlib
import shutil
import zipfile
import click
import gitlab
BASE_PATH = pathlib.Path(__file__).parent.resolve()
PROJECT_ID = int(os.getenv("CI_PROJECT_ID", 19378))
logger = logging.getLogger(__name__)
@click.command()
@click.option("--pipeline-id", required=True, type=int, help="Pipeline ID")
def main(pipeline_id: int):
logging.basicConfig(level=logging.INFO)
logger.info('Started')
gl = gitlab.Gitlab(
f"https://{os.getenv('GITLAB_ENDPOINT')}", private_token=os.getenv("RO_API_TOKEN")
)
project = gl.projects.get(PROJECT_ID)
pipeline = project.pipelines.get(pipeline_id)
print(pipeline.bridges.list())
pipeline_bridges = [
pipeline_bridge
for pipeline_bridge in pipeline.bridges.list()
if pipeline_bridge.name.startswith("functional")
and pipeline_bridge.downstream_pipeline is not None
]
ASSETS_DIR = pathlib.Path("tmp") / "results" / "iteration=0"
for pipeline_bridge in pipeline_bridges:
functional_pipeline = project.pipelines.get(pipeline_bridge.downstream_pipeline['id'])
environment = pipeline_bridge.name[len("functional:run_") :]
functional_pipeline_jobs = functional_pipeline.jobs.list(get_all=True)
logger.info("Starting with pipeline %s", pipeline_bridge.name)
for functional_pipeline_job in functional_pipeline_jobs:
job = project.jobs.get(functional_pipeline_job.id)
logger.info("Starting with job %s", job.name)
try:
file_name = '__artifacts.zip'
with open(file_name, "wb") as f:
job.artifacts(streamed=True, action=f.write)
zip = zipfile.ZipFile(file_name)
zip.extractall("tmp")
logger.info("Downloaded artifacts of job %s", job.name)
except Exception:
continue
os.unlink(file_name)
restart_dir = os.listdir(pathlib.Path("tmp") / "results" / "iteration=0")[-1]
golden_values_source = (
pathlib.Path(ASSETS_DIR)
/ f"{restart_dir}"
/ "assets"
/ "basic"
/ f"{job.name.replace('_', '-').lower()}-{environment}"
/ f"golden_values_{environment}.json"
)
golden_values_target = (
pathlib.Path("tests")
/ "functional_tests"
/ 'test_cases'
/ job.stage
/ job.name
/ f"golden_values_{environment}.json"
)
if golden_values_source.exists():
pathlib.Path(golden_values_target.parent).mkdir(parents=True, exist_ok=True)
logger.info(
"Move artifacts from %s to %s", golden_values_source, golden_values_target
)
shutil.move(golden_values_source, golden_values_target)
else:
logger.info(
"Golden values for %s does not exist. Skip.", str(f"{job.stage} / {job.name}")
)
shutil.rmtree("tmp")
logger.info("beep boop: All done!")
if __name__ == "__main__":
main()
......@@ -19,6 +19,12 @@ BASE_PATH = pathlib.Path(__file__).parent.resolve()
)
@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on")
@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on")
@click.option(
"--a100-partition", required=False, type=str, help="Slurm partition to use", default=None
)
@click.option(
"--h100-partition", required=False, type=str, help="Slurm partition to use", default=None
)
@click.option("--output-path", required=True, type=str, help="Path to write GitLab job to")
@click.option("--container-image", required=True, type=str, help="LTS Container image to use")
@click.option("--container-tag", required=True, type=str, help="Container tag to use")
......@@ -28,6 +34,8 @@ BASE_PATH = pathlib.Path(__file__).parent.resolve()
type=str,
help="Name of job that created the downstream pipeline",
)
@click.option("--record-checkpoints", required=False, type=str, help="Values are 'true' or 'false'")
@click.option("--slurm-account", required=True, type=str, help="Slurm account to use")
@click.option("--tag", required=False, type=str, help="Tag (only relevant for unit tests)")
@click.option(
"--run-name", required=False, type=str, help="Run name (only relevant for release tests)"
......@@ -46,10 +54,14 @@ def main(
test_cases: str,
a100_cluster: str,
h100_cluster: str,
a100_partition: Optional[str],
h100_partition: Optional[str],
output_path: str,
container_image: str,
container_tag: str,
dependent_job: str,
record_checkpoints: str,
slurm_account: str,
tag: Optional[str] = None,
run_name: Optional[str] = None,
wandb_experiment: Optional[str] = None,
......@@ -97,14 +109,19 @@ def main(
else:
gitlab_pipeline = {
"stages": list(set([test_case.spec.model for test_case in list_of_test_cases])),
"default": {"interruptible": True},
"default": {
"interruptible": True,
"retry": {"max": 2, "when": "runner_system_failure"},
},
}
for test_case in list_of_test_cases:
if test_case.spec.platforms == "dgx_a100":
cluster = a100_cluster
partition = a100_partition
elif test_case.spec.platforms == "dgx_h100":
cluster = h100_cluster
partition = h100_partition
else:
raise ValueError(f"Platform {test_case.spec.platforms} unknown")
......@@ -118,11 +135,17 @@ def main(
f"--environment {test_case.spec.environment}",
f"--n-repeat {n_repeat}",
f"--time-limit {time_limit}",
f"--scope {scope}",
f"--test-case '{test_case.spec.test_case}'",
f"--container-tag {container_tag}",
f"--cluster {cluster}",
f"--record-checkpoints {record_checkpoints}",
f"--account {slurm_account}",
]
if partition is not None:
script.append(f"--partition {partition}")
if tag is not None:
script.append(f"--tag {tag}")
......
......@@ -29,6 +29,12 @@ def load_script(config_path: str) -> str:
@click.option(
"--test-case", required=False, type=str, help="Returns a single test-case with matching name."
)
@click.option(
"--environment",
required=True,
type=str,
help="Pass 'lts' for PyTorch 24.01 and 'dev' for a more recent version.",
)
@click.option(
"--output-path",
required=True,
......@@ -36,9 +42,20 @@ def load_script(config_path: str) -> str:
help="Directory where the functional test will write its artifacts to (Tensorboard logs)",
default="/opt/megatron-lm",
)
def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], output_path: str):
def main(
model: Optional[str],
scope: Optional[str],
test_case: Optional[str],
environment: str,
output_path: str,
):
workloads = common.load_workloads(
container_image='none', scope=scope, model=model, test_case=test_case, container_tag='none'
container_image='none',
scope=scope,
model=model,
test_case=test_case,
environment=environment,
container_tag='none',
)
for workload in workloads:
......@@ -46,6 +63,7 @@ def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], o
continue
magic_values = dict(workload.spec)
magic_values["assets_dir"] = output_path
magic_values["artifacts_dir"] = output_path
file_path = (
pathlib.Path.cwd()
......
import json
import logging
import os
import pathlib
import re
......@@ -12,7 +13,6 @@ import click
import jetclient
import requests
import yaml
from jet import workloads
from jetclient.facades.objects import log as jet_log
from jetclient.services.dtos.pipeline import PipelineStatus
......@@ -20,6 +20,8 @@ from tests.test_utils.python_scripts import common
BASE_PATH = pathlib.Path(__file__).parent.resolve()
logger = logging.getLogger(__name__)
def register_pipeline_terminator(pipeline: jetclient.JETPipeline):
def sigterm_handler(_signo, _stack_frame):
......@@ -37,17 +39,24 @@ def launch_and_wait_for_completion(
environment: str,
n_repeat: int,
time_limit: int,
scope: str,
container_image: Optional[str],
container_tag: str,
cluster: str,
account: str,
record_checkpoints: str,
partition: Optional[str],
tag: Optional[str],
run_name: Optional[str],
wandb_experiment: Optional[str],
) -> jetclient.JETPipeline:
n_submit_errors = 0
cluster_config = {"account": account}
if partition is not None:
cluster_config['partition'] = partition
while n_submit_errors < 3:
n_submission_attempts = 0
while n_submission_attempts < 3:
try:
pipeline = jetclient.JETClient(
customer='mcore', gitlab_ci_token=os.getenv("RO_API_TOKEN"), env="prod"
).workloads.submit(
......@@ -56,13 +65,15 @@ def launch_and_wait_for_completion(
n_repeat=n_repeat,
time_limit=time_limit,
tag=tag,
scope=scope,
container_image=container_image,
container_tag=container_tag,
environment=environment,
record_checkpoints=record_checkpoints,
),
config_id=f"mcore/{common.resolve_cluster_config(cluster)}",
custom_config={
"launchers": {cluster: {"account": account, "ntasks_per_node": 8}},
"launchers": {cluster: cluster_config},
"executors": {
"jet-ci": {
"environments": {
......@@ -71,68 +82,77 @@ def launch_and_wait_for_completion(
"RUN_NAME": run_name or "",
"WANDB_API_KEY": os.getenv("WANDB_API_KEY") or "",
"WANDB_EXPERIMENT": wandb_experiment or "",
"RECORD_CHECKPOINTS": str(
"Record checkpoints"
in os.getenv("CI_MERGE_REQUEST_LABELS", "")
).lower(),
}
}
}
}
},
"outputs": {
"enabled": True,
"artifacts_storages": [common.resolve_artifact_config(cluster)],
},
},
wait_for_validation=True,
max_wait_time=(60 * 60),
)
except jetclient.clients.gitlab.GitlabAPIError as e:
logger.error(f"Faced {str(e)}. Waiting and retrying...")
n_submission_attempts += 1
time.sleep(2**n_submission_attempts * 5)
continue
if pipeline.get_status() == PipelineStatus.SUBMISSION_FAILED:
n_submit_errors += 1
print(f"Failed submitting pipeline. Let's try again ({n_submit_errors}/3)")
n_submission_attempts += 1
logger.info("Submission failed, attempt again (%s/3)", str(n_submission_attempts))
continue
break
register_pipeline_terminator(pipeline=pipeline)
print(
f"Pipeline triggered; inspect it here: https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/{pipeline.jet_id}",
flush=True,
logger.info(
"Pipeline triggered; inspect it here: https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/%s",
pipeline.jet_id,
)
n_wait_attempts = 0
while n_wait_attempts < 3:
try:
pipeline.wait(max_wait_time=60 * 60 * 24 * 7, interval=60 * 1)
break
except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as e:
print(e)
time.sleep(60 * 3**n_wait_attempts)
pipeline = workloads.get_pipeline(pipeline.jet_id)
n_wait_attempts += 1
pipeline.wait(max_wait_time=60 * 60 * 24 * 7, interval=60 * 1, retries_on_error=3)
print(f"Pipeline terminated; status: {pipeline.get_status()}")
logger.info(f"Pipeline terminated; status: {pipeline.get_status()}")
return pipeline
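As a side note, a tiny self-contained sketch of how the RECORD_CHECKPOINTS value set above is derived from merge-request labels; the label string is an assumed example, not captured CI state.

import os

# Assumed example value; GitLab exposes MR labels as a comma-separated string.
os.environ["CI_MERGE_REQUEST_LABELS"] = "Expedite,Record checkpoints"

record_checkpoints = str(
    "Record checkpoints" in os.getenv("CI_MERGE_REQUEST_LABELS", "")
).lower()
print(record_checkpoints)  # -> "true"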
def download_job_assets(logs: List[jet_log.JETLog], iteration: int = 0) -> List[str]:
if not logs:
logger.info("No logs found for download.")
return [""]
assets_base_path = BASE_PATH / ".." / ".." / ".." / ".." / "results" / f"iteration={iteration}"
assets_base_path = BASE_PATH / ".." / ".." / ".." / "results" / f"iteration={iteration}"
for restart_idx, log in enumerate(logs):
assets = log.get_assets()
assets_path = assets_base_path / f"restart={restart_idx}"
assets_path.mkdir(parents=True, exist_ok=True)
for log_filename in assets.keys():
with open(assets_path / log_filename, "w") as fh:
assets[log_filename].download(pathlib.Path(fh.name))
for asset in assets:
(assets_path / asset.source_path).parent.mkdir(parents=True, exist_ok=True)
with open(assets_path / asset.source_path, "w") as fh:
dest = pathlib.Path(fh.name)
logger.info("Downloading log %s to %s", asset.source_path, str(dest))
asset.download(dest)
return assets
def extract_logs_to_string(logs: List[jet_log.JETLog]) -> List[str]:
if not logs:
logger.info("No logs found for download.")
return [""]
assets = logs[0].get_assets()
log_filename = [key for key in assets.keys() if key.endswith(".log")][0]
with tempfile.NamedTemporaryFile() as tmp_file:
assets[log_filename].download(pathlib.Path(tmp_file.name))
assets = logs[-1].get_assets()
asset = [asset for asset in assets if asset.name == "output_script-0.log"][0]
asset.download(pathlib.Path(tmp_file.name))
with open(pathlib.Path(tmp_file.name), "r") as fh:
return fh.readlines()
......@@ -161,6 +181,7 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]:
)
@click.option("--n-repeat", required=False, default=1, type=int)
@click.option("--time-limit", required=False, default=1800, type=int)
@click.option("--scope", required=False, default="mr", type=str)
@click.option(
"--account",
required=False,
......@@ -168,10 +189,12 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]:
help="Slurm account to use",
default="coreai_dlalgo_mcore",
)
@click.option("--partition", required=False, type=str, help="Slurm partition to use", default=None)
@click.option("--cluster", required=True, type=str, help="Cluster to run on")
@click.option("--container-tag", required=True, type=str, help="Base image of Mcore image")
@click.option("--container-image", required=False, type=str, help="Base image of Mcore image")
@click.option("--tag", required=False, type=str, help="Tag (only relevant for unit tests)")
@click.option("--record-checkpoints", required=False, type=str, help="Values are 'true' or 'false'")
@click.option(
"--run-name", required=False, type=str, help="Run name (only relevant for release tests)"
)
......@@ -187,14 +210,20 @@ def main(
environment: str,
n_repeat: int,
time_limit: int,
scope: str,
account: str,
partition: Optional[str],
cluster: str,
container_tag: str,
record_checkpoints: str,
tag: Optional[str] = None,
container_image: Optional[str] = None,
run_name: Optional[str] = None,
wandb_experiment: Optional[str] = None,
):
logging.basicConfig(level=logging.INFO)
logger.info('Started')
model_config_path = pathlib.Path(
BASE_PATH
/ ".."
......@@ -217,8 +246,10 @@ def main(
else:
test_type = "unit_test"
logger.info('test_type will be %s', test_type)
if test_type == "release" and (run_name is None or wandb_experiment is None):
print(f"Not all arguments provided ({run_name=}, {wandb_experiment=})")
logger.error(f"Not all arguments provided ({run_name=}, {wandb_experiment=})")
sys.exit(1)
n_attempts = 0
......@@ -230,13 +261,16 @@ def main(
environment=environment,
n_repeat=n_repeat,
time_limit=time_limit,
scope=scope,
container_image=container_image,
container_tag=container_tag,
cluster=cluster,
account=account,
partition=partition,
tag=tag,
run_name=run_name,
wandb_experiment=wandb_experiment,
record_checkpoints=record_checkpoints,
)
main_job = [job for job in pipeline.get_jobs() if job.name.startswith("basic")][0]
......@@ -247,25 +281,39 @@ def main(
jet_log = main_job.get_logs()
logs = extract_logs_to_string(logs=jet_log)
download_job_assets(logs=jet_log, iteration=n_iteration)
no_log = False
break
except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as e:
print(e)
time.sleep((3**n_download_attempt) * 60)
except (
requests.exceptions.ConnectionError,
json.decoder.JSONDecodeError,
UnicodeDecodeError,
) as e:
logger.error(e)
time.sleep(2 * n_download_attempt * 15)
n_download_attempt += 1
no_log = True
except (KeyError, IndexError) as e:
logger.error(e)
no_log = True
break
if no_log:
logger.error("Did not find any logs to download, retry.")
continue
concat_logs = "\n".join(logs)
if concat_logs.strip() == "":
logger.error("No logs found. Try again.")
n_attempts += 1
continue
if test_type != "release":
print(f"Logs:\n{concat_logs}")
success = pipeline.get_status() == PipelineStatus.SUCCESS
logger.info("Pipeline terminated with status %s", pipeline.get_status().name)
if test_type == "unit_test":
success = success and (
(
re.search(r'=.*?\bpassed\b.*?=', concat_logs)
and not re.search(r'=.*?\bfailed\b.*?=', concat_logs)
)
or "0 selected" in concat_logs
)
sys.exit(int(not success)) # invert for exit 0
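For illustration, a small sketch of the unit-test pass/fail check above run against an assumed pytest summary banner (the banner text is an example, not captured CI output).

import re

summary = "===== 12 passed, 2 warnings in 3.21s ====="  # assumed example banner
passed = bool(re.search(r'=.*?\bpassed\b.*?=', summary)) and not re.search(
    r'=.*?\bfailed\b.*?=', summary
)
print(passed)  # -> True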
if test_type != "release":
......@@ -277,22 +325,24 @@ def main(
or "uncorrectable ECC error encountered" in concat_logs
or "illegal memory access" in concat_logs
or "illegal instruction" in concat_logs
or "torch.distributed.DistNetworkError" in concat_logs
):
print("Detected NCCL failure, attempt restart.")
logger.error("Detected NCCL failure, attempt restart.")
n_attempts += 1
continue
if "FAILED tests/functional_tests/python_test_utils/test_ci_pipeline.py" in concat_logs:
print("Non-determinism, let's try another node.")
if "FAILED tests/functional_tests/python_test_utils" in concat_logs:
logger.error("Non-determinism, let's try another node.")
n_nondeterminism_attemps += 1
continue
sys.exit(1)
if parse_failed_job(logs=logs):
n_attempts += 1
continue
if parse_finished_training(logs=logs):
success = pipeline.get_status() == PipelineStatus.SUCCESS
sys.exit(int(not success)) # invert for exit 0
n_iteration += 1
sys.exit(1)
......
......@@ -3,14 +3,17 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: bert
nodes: 1
build: mcore-pyt-{environment}
gpus: 8
platforms: dgx_a100
time_limit:
n_repeat:
artifacts:
/workspace/data/bert_data: text/the_pile/bert_shard00
/workspace/checkpoints/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev: model/mcore_bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev/22410107
script: |-
ls
cd /opt/megatron-lm
......@@ -20,7 +23,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=pretrain_bert.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -30,26 +34,75 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G
- bert_mr_tp1_pp4_vp2_dgx_a100_1N8G
- bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G
- bert_mr_tp2_pp2_dgx_a100_1N8G
- bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
- environment: [lts, dev]
scope: [nightly]
n_repeat: [5]
time_limit: [3600]
test_case:
- bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2
- bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
- bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1
- bert_nightly_dgx_a100_1N8G_tp1_pp2
- bert_nightly_dgx_a100_1N8G_tp4_pp1
- test_case: [bert_mr_mcore_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp1_pp4_vp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
# - test_case: [bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G]
# products:
# - environment: [dev] Update checkpoint
# scope: [mr]
......@@ -3,11 +3,14 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: gpt
build: mcore-pyt-{environment}
nodes: 1
gpus: 2
platforms: dgx_a100
time_limit:
n_repeat:
artifacts:
/workspace/data/gpt3_data: text/the_pile/shard00
/workspace/checkpoints/teacher: model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher
......@@ -20,7 +23,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=./examples/export/knowledge_distillation/pretrain_gpt_modelopt.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -29,9 +33,7 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- scope: [nightly]
platforms: [dgx_a100]
time_limit: [1200]
environment: [lts, dev] # Disable dev for now
test_case:
- gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume
- test_case: [gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume]
products:
- environment: [dev, lts]
scope: [nightly]
......@@ -3,14 +3,14 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: gpt-nemo
build: mcore-nemo
nodes: 1
gpus: 8
platforms: dgx_a100
time_limit: 1800
scope: null
scope:
script: |-
ls
cd /opt/NeMo
......@@ -20,7 +20,8 @@ spec:
"DATA_CACHE_PATH='-'"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
"TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -30,10 +31,15 @@ spec:
bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
test_case:
- gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
- gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
\ No newline at end of file
......@@ -3,13 +3,17 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: gpt
build: mcore-pyt-{environment}
nodes: 1
gpus: 8
n_repeat: 5
platforms: dgx_a100
artifacts:
/workspace/data/gpt3_data: text/the_pile/shard00
/workspace/checkpoints/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev/22410107
/workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev/22410107
script: |-
ls
cd /opt/megatron-lm
......@@ -19,7 +23,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=pretrain_gpt.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -29,138 +34,694 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
platforms: [dgx_a100]
time_limit: [1800]
n_repeat: [5]
test_case:
- gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
# - gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G # torch >= 2.4.0
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G
- gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G
- gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G
- gpt3_mr_te_tp2_pp2_dgx_a100_1N8G
- gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G
- gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G
- gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G
- gpt3_mr_tp2_pp2_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G # cp and attention with a2a+p2p comm type
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G # cp and attention with a2a+p2p comm type
- environment: [lts, dev]
scope: [nightly]
platforms: [dgx_a100]
time_limit: [3600]
n_repeat: [5]
test_case:
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
# - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te # torch >= 2.4.0
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel
# - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts # non-determinism
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts
- gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1
- gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch
- environment: [lts]
scope: [nightly]
platforms: [dgx_a100]
time_limit: [3600]
n_repeat: [5]
test_case:
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel # non-determinism in dev
- environment: [lts, dev]
#######################################################################
# Nightly tests: Run both DEV and LTS unless something is flaky #
#######################################################################
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel]
products:
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel]
products:
- environment: [dev]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last]
products:
- environment: [dev]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
# - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts]
# products:
# - environment: [dev, lts]
# scope: [nightly]
# - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te]
# products:
# - environment: [dev, lts]
# scope: [nightly]
#######################################################################
# Weekly tests: Run both DEV and LTS unless something is flaky #
#######################################################################
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel]
products:
- environment: [dev, lts]
scope: [weekly]
platforms: [dgx_h100]
time_limit: [9000]
test_case:
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp]
products:
- environment: [dev, lts]
scope: [weekly]
#######################################################################
# MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for #
# some very important tests. #
#######################################################################
- test_case: [gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G]
products:
# - environment: [dev] Until TE is at 1.12
# scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G]
products:
# - environment: [dev] Until TE is at 1.12
# scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G] Failing on max-memory
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G]
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G] # Failing on max-memory
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G]
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G]
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
#######################################################################
# Super important MR tests that run for both DEV and LTS per MR #
#######################################################################
- test_case: [gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_te_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
# - test_case: [gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G]
# products:
# - environment: [dev, lts]
# scope: [mr]
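The `products` matrices above are easiest to read as a Cartesian expansion: each `environment`/`scope` entry under a `test_case` multiplies out into one concrete CI job per combination (e.g. a `dev`/`mr` job and an `lts`/`nightly` job for the same case). The sketch below is a minimal, hypothetical illustration of that expansion, assuming a simplified schema; `expand_products` and the inline sample are not part of the repository's actual pipeline generator.

```python
from itertools import product

def expand_products(test_case_entries):
    """Expand each products entry into concrete (test_case, environment, scope) jobs.

    Illustrative only -- the real generator may also fan out keys such as
    n_repeat, gpus, or time_limit in the same way.
    """
    jobs = []
    for entry in test_case_entries:
        for test_case in entry["test_case"]:
            for prod in entry["products"]:
                for env, scope in product(prod["environment"], prod["scope"]):
                    jobs.append({"test_case": test_case, "environment": env, "scope": scope})
    return jobs

# Hypothetical sample mirroring one block of the YAML above.
sample = [
    {
        "test_case": ["gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G"],
        "products": [{"environment": ["dev", "lts"], "scope": ["mr"]}],
    }
]

for job in expand_products(sample):
    print(job)
# -> one 'dev'/'mr' job and one 'lts'/'mr' job for the same test case
```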
......@@ -2,18 +2,17 @@ type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
launchers:
type:slurm:
ntasks_per_node: '{gpus}'
spec:
name: '{test_case}'
name: '{test_case}_{environment}'
model: multimodal-llava
build: mcore-pyt-{environment}
nodes: 1
gpus: 8
platforms: dgx_a100
time_limit: 1800
scope: null
time_limit:
n_repeat:
test_case:
scope:
script: |-
ls
cd /opt/megatron-lm
......@@ -23,7 +22,8 @@ spec:
"DATA_CACHE_PATH='-'"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_vlm.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -33,19 +33,39 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
- test_case: [multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
gpus: [8]
test_case:
- multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G
- environment: [lts, dev]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
gpus: [7]
test_case:
- multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
- multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
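The llava spec above renders placeholders such as `{test_case}` and `{environment}` into the job `name` and then hands settings to `run_ci_test.sh` as `KEY=VALUE` strings, now with separate `CHECKPOINT_SAVE_PATH` and `CHECKPOINT_LOAD_PATH` entries. Below is a minimal sketch of that substitution and parsing, assuming simple `str.format` semantics; the `render_arguments` helper, the sample context values, and the way `{assets_dir}`/`{artifacts_dir}` are resolved are assumptions, not the harness's actual implementation.

```python
def render_arguments(templates, context):
    """Substitute {placeholders} and split each rendered 'KEY=VALUE' string.

    Sketch only: the real run_ci_test.sh consumes these as shell variables.
    """
    rendered = [t.format(**context) for t in templates]
    return dict(item.split("=", 1) for item in rendered)

# Hypothetical context for one job generated from the matrix above.
context = {
    "test_case": "multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G",
    "environment": "dev",
    "model": "multimodal-llava",
    "artifacts_dir": "/tmp/artifacts",
    "name": "multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G_dev",
}

templates = [
    "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints",
    "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints",
    "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml",
]

args = render_arguments(templates, context)
print(args["CHECKPOINT_LOAD_PATH"])
# -> /workspace/checkpoints/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G_dev/checkpoints
```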
......@@ -3,7 +3,7 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: t5
build: mcore-pyt-{environment}
nodes: 1
......@@ -11,6 +11,7 @@ spec:
platforms: dgx_a100
artifacts:
/workspace/data/t5_data: text/the_pile/t5_shard00
/workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev: model/mcore_t5/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev/22410107
script: |-
ls
cd /opt/megatron-lm
......@@ -20,7 +21,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=pretrain_t5.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -30,32 +32,77 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G
- t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
- t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G
- t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
- environment: [lts]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
- environment: [lts, dev]
scope: [nightly]
time_limit: [9000]
n_repeat: [1]
test_case:
- t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch
- t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
- t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
- t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1
- t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch
- t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1
- test_case: [t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
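For the frozen-resume t5 case above, the spec mounts a pinned checkpoint artifact under `/workspace/checkpoints/<name>` and the job loads from `CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}`. The following is a small sketch of how that load path could be derived, assuming `name` is the `'{test_case}_{environment}'` template from the spec; the helper name is illustrative and the directory layout beyond what the artifacts mapping shows is an assumption.

```python
def frozen_checkpoint_load_path(test_case: str, environment: str) -> str:
    """Build the load path for a pinned ('frozen') checkpoint.

    Assumes name == f"{test_case}_{environment}", matching the spec's name
    template; /workspace/checkpoints is the mount point from the artifacts map.
    """
    name = f"{test_case}_{environment}"
    return f"/workspace/checkpoints/{name}"

print(frozen_checkpoint_load_path(
    "t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G", "dev"
))
# -> /workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev
```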
......@@ -3,7 +3,7 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: '{test_case}'
name: '{test_case}_{environment}_{tag}'
model: unit-tests
nodes: 1
build: mcore-pyt-{environment}
......@@ -60,21 +60,84 @@ spec:
fi
done <<< "$IGNORE_TEST_CASES"
echo "------ARGUMENTS for SLURM ---"
MASTER_ADDR=${{MASTER_ADDR:-localhost}}
MASTER_PORT=${{MASTER_PORT:-6000}}
NUM_NODES=${{NUM_NODES:-${{SLURM_NNODES}}}}
GPUS_PER_NODE=${{GPUS_PER_NODE:-8}}
NODE_RANK=${{SLURM_NODEID:-0}}
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
--node_rank $NODE_RANK
--log-dir {assets_dir}
--tee "0:3"
--redirects "3"
)
# Reduce memory usage by NCCL
export NCCL_MAX_NCHANNELS=1
export NCCL_NVLS_ENABLE=0
for i in $(seq $UNIT_TEST_REPEAT); do
CMD=$(echo pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail ${{IGNORE_ARGS[@]}} -m "'${{MARKER_ARG}}'" $BUCKET)
CMD=$(echo torchrun ${{DISTRIBUTED_ARGS[@]}} -m pytest \
-xvs \
--cov-report=term \
--cov-branch \
--cov=megatron/core \
--cov-report xml:coverage.xml \
--no-cov-on-fail ${{IGNORE_ARGS[@]}} \
-m "'${{MARKER_ARG}}'" $BUCKET)
eval "$CMD"
done
ls -al
cp .coverage_0 {assets_dir}/coverage_report
cp coverage.xml {assets_dir}
products:
- test_case: [tests/unit_tests/data/]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/dist_checkpointing/*.py]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/dist_checkpointing/models/]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/transformer/*.py]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/transformer/moe]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
test_case:
- tests/unit_tests/data/
- tests/unit_tests/dist_checkpointing/*.py
- tests/unit_tests/dist_checkpointing/models/
- tests/unit_tests/transformer/*.py
- tests/unit_tests/transformer/moe
- tests/unit_tests
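The unit-test script above switches from invoking pytest directly to launching it through `torchrun` with the `DISTRIBUTED_ARGS` shown, repeating the run `UNIT_TEST_REPEAT` times and copying the coverage XML to the assets directory. Purely as an illustration of what that loop assembles (not how the CI actually invokes it), here is a Python sketch that builds and runs an equivalent command; the defaults mirror the shell fallbacks and are assumptions for a local single-node run, and marker/ignore handling is omitted.

```python
import os
import subprocess

def run_unit_bucket(bucket: str, repeat: int = 1, assets_dir: str = "/tmp/assets"):
    """Assemble the torchrun + pytest command the CI script builds in shell.

    Sketch only: MARKER_ARG and IGNORE_ARGS handling from the YAML script
    are intentionally left out.
    """
    master_addr = os.environ.get("MASTER_ADDR", "localhost")
    master_port = os.environ.get("MASTER_PORT", "6000")
    gpus_per_node = os.environ.get("GPUS_PER_NODE", "8")

    cmd = [
        "torchrun",
        "--nproc_per_node", gpus_per_node,
        "--nnodes", "1",
        "--master_addr", master_addr,
        "--master_port", master_port,
        "--node_rank", "0",
        "--log-dir", assets_dir,
        "-m", "pytest", "-xvs",
        "--cov-report=term",
        "--cov-branch",
        "--cov=megatron/core",
        "--cov-report", "xml:coverage.xml",
        "--no-cov-on-fail",
        bucket,
    ]
    for _ in range(repeat):
        subprocess.run(cmd, check=True)

# Example (requires torch and pytest-cov installed):
# run_unit_bucket("tests/unit_tests/data/", repeat=1)
```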
......@@ -192,63 +192,15 @@ def test_builder():
assert datasets[1] is None
assert datasets[2] is None
# This build used to fail when building datasets without a sample buffer
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend_per_split=[blends[Split.train], None, None],
)
try:
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [1000, None, None], lambda: True, config
).build()
raise RuntimeError
except IndexError:
##
#
# The size per dataset is a function of the requested size, the weight per dataset,
# and a constant coefficient. The sizes, and consequently the total size to request,
# are modified such that the weights may or may not be sufficiently representative.
# To fix this, the weights should be reset according to the new sizes:
#
# S := size
# W := weights
#
# S = func(S, W)
#
# W = S / sum(S)
#
##
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend_per_split=[blends[Split.train], None, None],
renormalize_blend_weights=True,
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [1000, None, None], lambda: True, config
).build()
assert (
len(datasets[0]) >= 1000
and len(datasets[0]) <= 1000 * (1 + _MARGIN) + _NUM_DATASETS
)
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend_per_split=[blends[Split.train], blends[Split.valid], blends[Split.test]],
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [100, 100, 100], lambda: True, config
).build()
assert (
len(datasets[0]) >= 100 and len(datasets[0]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[2]) >= 100 and len(datasets[2]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
config = BlendedMegatronDatasetConfig(
random_seed=1234,
......@@ -322,73 +274,16 @@ def test_builder():
assert len(datasets[1]) == sum(_SIZES[Split.train]) / 2
assert datasets[2] is None
# 990 9 1
# 100000 1000 1
# []
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend=blends[Split.train],
split="990,9,1",
)
try:
# All three of 100000, 1000, and 1 result in error, yet 10000 and 100 do not
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [100000, 1000, 1], lambda: True, config
).build()
except IndexError:
##
#
# The size per dataset is a function of the requested size, the weight per dataset,
# and a constant coefficient. The sizes, and consequently the total size to request,
# are modified such that the weights may or may not be sufficiently representative.
# To fix this, the weights should be reset according to the new sizes:
#
# S := size
# W := weights
#
# S = func(S, W)
#
# W = S / sum(S)
#
##
# This build used to fail when building datasets without a sample buffer
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend=blends[Split.train],
split="990,9,1",
renormalize_blend_weights=True,
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [100000, 1000, 1], lambda: True, config
).build()
assert (
len(datasets[0]) >= 100000
and len(datasets[0]) <= 100000 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[1]) >= 1000
and len(datasets[1]) <= 1000 * (1 + _MARGIN) + _NUM_DATASETS
)
assert len(datasets[2]) >= 1 and len(datasets[2]) <= 1 * (1 + _MARGIN) + _NUM_DATASETS
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend=blends[Split.train],
split="990,9,1",
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [10000, 100, 0], lambda: True, config
).build()
assert (
len(datasets[0]) >= 10000
and len(datasets[0]) <= 10000 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
assert len(datasets[2]) == 0
if __name__ == "__main__":
......
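The comment block in the test above explains why `renormalize_blend_weights=True` avoids the IndexError: the per-dataset sample counts S are derived from the requested total and the blend weights W, the counts are then adjusted, and the weights must be reset to W = S / sum(S) so they stay representative of the sizes actually drawn. Below is a minimal numeric illustration of that renormalization step; the helper name and the example numbers are mine, not taken from the dataset builder.

```python
def renormalize_blend_weights(sizes):
    """Reset blend weights from the (possibly adjusted) per-dataset sizes: W = S / sum(S)."""
    total = sum(sizes)
    return [s / total for s in sizes]

# Hypothetical per-dataset sample counts after the builder adjusts them.
sizes = [620, 280, 100]
weights = renormalize_blend_weights(sizes)
print(weights)       # [0.62, 0.28, 0.1]
print(sum(weights))  # close to 1.0
```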
......@@ -10,6 +10,11 @@ def pytest_sessionfinish(session, exitstatus):
session.exitstatus = 0
@pytest.fixture(scope="class")
def tmp_dir_per_class(tmp_path_factory):
return tmp_path_factory.mktemp("data")
@pytest.fixture(scope='session', autouse=True)
def set_default_dist_ckpt_strategy():
def get_pyt_dist_save_sharded_strategy():
......
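The conftest change above adds a class-scoped `tmp_dir_per_class` fixture built from pytest's `tmp_path_factory`, so all tests in one class share a single temporary directory. A minimal, hypothetical usage sketch follows; the test class and its contents are illustrative and not part of this commit.

```python
import pytest

@pytest.fixture(scope="class")
def tmp_dir_per_class(tmp_path_factory):
    # Same shape as the fixture added in conftest.py: one temp dir per test class.
    return tmp_path_factory.mktemp("data")

class TestCheckpointRoundTrip:
    def test_write(self, tmp_dir_per_class):
        (tmp_dir_per_class / "ckpt.txt").write_text("state")

    def test_read(self, tmp_dir_per_class):
        # Same directory as in test_write because the fixture is class-scoped.
        assert (tmp_dir_per_class / "ckpt.txt").read_text() == "state"
```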