Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed
{
"lm loss": {
"start_step": 0,
"end_step": 100,
"step_interval": 5,
"values": [
10.39854,
9.41109,
8.8833,
8.56279,
8.28765,
8.10226,
7.83824,
7.53414,
7.39426,
7.28765,
7.36798,
7.22207,
7.10595,
7.05273,
6.91414,
6.96485,
6.97279,
7.03525,
6.70355,
6.97029
]
},
"num-zeros": {
"start_step": 0,
"end_step": 100,
"step_interval": 5,
"values": [
43320.0,
40948.0,
43971.0,
41622.0,
44740.0,
43919.0,
41231.0,
42497.0,
44664.0,
43894.0,
41149.0,
43254.0,
39687.0,
45400.0,
43313.0,
43891.0,
45351.0,
45692.0,
46187.0,
44657.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 100,
"step_interval": 5,
"values": [
14.46368,
0.41717,
0.42344,
0.4102,
0.40332,
0.40531,
0.40418,
0.40386,
0.40711,
0.4048,
0.40536,
0.40331,
0.40175,
0.4047,
0.40982,
0.40834,
0.40594,
0.40872,
0.40896,
0.41014
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.39854, "5": 9.39701, "10": 9.03359, "15": 8.67298, "20": 8.28241, "25": 8.00349, "30": 7.88919, "35": 7.67196, "40": 7.50912, "45": 7.35246, "50": 7.18229, "55": 7.15567, "60": 7.14148, "65": 7.00001, "70": 7.0554, "75": 7.05859, "80": 6.94155, "85": 6.84584, "90": 7.2405, "95": 6.84353, "100": 6.96854}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 43320.0, "5": 45392.0, "10": 45363.0, "15": 43919.0, "20": 44778.0, "25": 42432.0, "30": 43986.0, "35": 43261.0, "40": 43242.0, "45": 43266.0, "50": 43346.0, "55": 43875.0, "60": 41289.0, "65": 44697.0, "70": 45530.0, "75": 44661.0, "80": 41029.0, "85": 43973.0, "90": 44723.0, "95": 44054.0, "100": 42464.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2140224000.0, "5": 2140224000.0, "10": 2140224000.0, "15": 2140224000.0, "20": 2140224000.0, "25": 2140224000.0, "30": 2140224000.0, "35": 2140224000.0, "40": 2140224000.0, "45": 2140224000.0, "50": 2140224000.0, "55": 2140224000.0, "60": 2140224000.0, "65": 2140224000.0, "70": 2140224000.0, "75": 2140224000.0, "80": 2140224000.0, "85": 2140224000.0, "90": 2140224000.0, "95": 2140224000.0, "100": 2140224000.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2372122112.0, "5": 3305918976.0, "10": 3305918976.0, "15": 3305918976.0, "20": 3305918976.0, "25": 3305918976.0, "30": 3305918976.0, "35": 3305918976.0, "40": 3305918976.0, "45": 3305918976.0, "50": 3305918976.0, "55": 3305918976.0, "60": 3305918976.0, "65": 3305918976.0, "70": 3305918976.0, "75": 3305918976.0, "80": 3305918976.0, "85": 3305918976.0, "90": 3305918976.0, "95": 3305918976.0, "100": 3306050048.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.40654, "5": 0.40596, "10": 0.41633, "15": 0.39729, "20": 0.39823, "25": 0.39786, "30": 0.39874, "35": 0.39845, "40": 0.40982, "45": 0.39982, "50": 0.39604, "55": 0.39557, "60": 0.39545, "65": 0.39649, "70": 0.39623, "75": 0.39574, "80": 0.40039, "85": 0.39829, "90": 0.39569, "95": 0.39538, "100": 0.39981}}}
\ No newline at end of file
......@@ -34,8 +34,8 @@ MODEL_ARGS:
--tokenizer-type: BertWordPieceCase
--calculate-per-token-loss: true
--split: 99982,9,9
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-params-norm: true
--log-num-zeros-in-grad: true
......@@ -50,4 +50,5 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--deterministic-mode: true
--ckpt-format: torch
--log-memory-to-tensorboard: true
TEST_TYPE: regular
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: '1'
NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1'
TEST_TYPE: 'release'
MODEL_ARGS:
# T5 model args
......@@ -16,7 +15,6 @@ MODEL_ARGS:
--max-position-embeddings: 512
--init-method-std: 0.015
--attention-backend: unfused
# Training args
--micro-batch-size: 32
--global-batch-size: 512
......@@ -47,8 +45,8 @@ MODEL_ARGS:
--log-interval: 100
--save-interval: 2000
--eval-interval: 1000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--eval-iters: 10
--tensorboard-dir: ${TENSORBOARD_PATH}
--log-timers-to-tensorboard: true
......
......@@ -21,13 +21,28 @@ def resolve_cluster_config(cluster: str) -> str:
raise ValueError(f"Unknown cluster {cluster} provided.")
def resolve_artifact_config(cluster: str) -> str:
if cluster == "dgxh100_eos":
return "eos_lustre"
if cluster == "dgxa100_dracooci":
return "draco-oci_lustre"
if cluster == "dgxa100_dracooci-ord":
return "draco-oci-ord_lustre"
if cluster == "dgxh100_coreweave":
return "coreweave_lustre"
raise ValueError(f"Unknown cluster {cluster} provided.")
def flatten_products(
workload_manifest: jetclient.JETWorkloadManifest,
) -> jetclient.JETWorkloadManifest:
"""Flattens a nested dict of products"""
workload_manifest.products = [
dict(zip(inp.keys(), values))
for inp in workload_manifest.products
dict(**dict(zip(inp.keys(), values)), **{"test_case": product['test_case'][0]})
for product in workload_manifest.products
if "products" in product
for inp in product['products']
for values in itertools.product(*inp.values())
]
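For reference, a minimal standalone sketch of the nested-product expansion that the updated flatten_products performs; the manifest shape and test-case name below are illustrative only, mirroring the recipe YAML further down in this diff.

import itertools

# Hypothetical manifest excerpt; each entry pairs one test_case with its own
# nested "products" matrix, as in the updated recipe YAML.
nested_products = [
    {
        "test_case": ["bert_mr_mcore_tp2_pp2_dgx_a100_1N8G"],
        "products": [
            {"environment": ["dev"], "scope": ["mr"]},
            {"environment": ["lts"], "scope": ["nightly"]},
        ],
    }
]

# Same comprehension as in flatten_products above.
flat = [
    dict(**dict(zip(inp.keys(), values)), **{"test_case": product["test_case"][0]})
    for product in nested_products
    if "products" in product
    for inp in product["products"]
    for values in itertools.product(*inp.values())
]

print(flat)
# [{'environment': 'dev', 'scope': 'mr', 'test_case': 'bert_mr_mcore_tp2_pp2_dgx_a100_1N8G'},
#  {'environment': 'lts', 'scope': 'nightly', 'test_case': 'bert_mr_mcore_tp2_pp2_dgx_a100_1N8G'}]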
......@@ -195,6 +210,7 @@ def load_workloads(
model: Optional[str] = None,
test_case: Optional[str] = None,
container_image: Optional[str] = None,
record_checkpoints: Optional[str] = None,
) -> List[jetclient.JETWorkloadManifest]:
"""Return all workloads from disk that match scope and platform."""
recipes_dir = BASE_PATH / ".." / "recipes"
......@@ -238,4 +254,17 @@ def load_workloads(
workloads.append(build_workload)
workload.spec.n_repeat = n_repeat
workload.spec.time_limit = time_limit
if record_checkpoints == 'true':
workload.outputs = [
{
"type": "artifact",
"key": f"unverified/model/mcore-ci/{container_tag}/{{model}}/{{name}}",
"subdir": "checkpoints",
"name": r"{model}/{name}",
"description": r"Checkpoint of {model}/{name}",
"pic": {"name": "Mcore CI", "email": "okoenig@nvidia.com"},
"labels": {"origin": "ADLR/Megatron-LM"},
}
]
return workloads
import glob
import logging
import os
import pathlib
import shutil
import zipfile
import click
import gitlab
BASE_PATH = pathlib.Path(__file__).parent.resolve()
PROJECT_ID = int(os.getenv("CI_PROJECT_ID", 19378))
logger = logging.getLogger(__name__)
@click.command()
@click.option("--pipeline-id", required=True, type=int, help="Pipeline ID")
def main(pipeline_id: int):
logging.basicConfig(level=logging.INFO)
logger.info('Started')
gl = gitlab.Gitlab(
f"https://{os.getenv('GITLAB_ENDPOINT')}", private_token=os.getenv("RO_API_TOKEN")
)
project = gl.projects.get(PROJECT_ID)
pipeline = project.pipelines.get(pipeline_id)
print(pipeline.bridges.list())
pipeline_bridges = [
pipeline_bridge
for pipeline_bridge in pipeline.bridges.list()
if pipeline_bridge.name.startswith("test:unit_tests")
and pipeline_bridge.downstream_pipeline is not None
]
ASSETS_DIR = pathlib.Path("tmp") / "results" / "iteration=0"
for pipeline_bridge in pipeline_bridges:
functional_pipeline = project.pipelines.get(pipeline_bridge.downstream_pipeline['id'])
functional_pipeline_jobs = functional_pipeline.jobs.list(get_all=True)
if "legacy" in pipeline_bridge.name:
continue
logger.info("Starting with pipeline %s", pipeline_bridge.name)
for functional_pipeline_job in functional_pipeline_jobs:
job = project.jobs.get(functional_pipeline_job.id)
logger.info("Starting with job %s", job.name)
try:
file_name = '__artifacts.zip'
with open(file_name, "wb") as f:
job.artifacts(streamed=True, action=f.write)
zip = zipfile.ZipFile(file_name)
zip.extractall("tmp")
logger.info("Downloaded artifacts of job %s", job.name)
except Exception:
continue
os.unlink(file_name)
restart_dir = os.listdir(pathlib.Path("tmp") / "results" / "iteration=0")[-1]
coverage_report_source = list(
glob.glob(
str(
pathlib.Path(ASSETS_DIR)
/ f"{restart_dir}"
/ "assets"
/ "basic"
/ "*"
/ "coverage_report"
)
)
)[0]
coverage_report_target = (
pathlib.Path("coverage_results") / job.name.replace("/", "-") / "coverage_report"
)
if pathlib.Path(coverage_report_source).exists():
pathlib.Path(coverage_report_target.parent).mkdir(parents=True, exist_ok=True)
logger.info(
"Move artifacts from %s to %s", coverage_report_source, coverage_report_target
)
shutil.move(coverage_report_source, coverage_report_target)
else:
logger.info(
"coverage_report for %s does not exist. Skip.", str(f"{job.stage} / {job.name}")
)
shutil.rmtree("tmp")
logger.info("beep boop: All done!")
if __name__ == "__main__":
main()
import logging
import os
import pathlib
import shutil
import zipfile
import click
import gitlab
BASE_PATH = pathlib.Path(__file__).parent.resolve()
PROJECT_ID = int(os.getenv("CI_PROJECT_ID", 19378))
logger = logging.getLogger(__name__)
@click.command()
@click.option("--pipeline-id", required=True, type=int, help="Pipeline ID")
def main(pipeline_id: int):
logging.basicConfig(level=logging.INFO)
logger.info('Started')
gl = gitlab.Gitlab(
f"https://{os.getenv('GITLAB_ENDPOINT')}", private_token=os.getenv("RO_API_TOKEN")
)
project = gl.projects.get(PROJECT_ID)
pipeline = project.pipelines.get(pipeline_id)
print(pipeline.bridges.list())
pipeline_bridges = [
pipeline_bridge
for pipeline_bridge in pipeline.bridges.list()
if pipeline_bridge.name.startswith("functional")
and pipeline_bridge.downstream_pipeline is not None
]
ASSETS_DIR = pathlib.Path("tmp") / "results" / "iteration=0"
for pipeline_bridge in pipeline_bridges:
functional_pipeline = project.pipelines.get(pipeline_bridge.downstream_pipeline['id'])
environment = pipeline_bridge.name[len("functional:run_") :]
functional_pipeline_jobs = functional_pipeline.jobs.list(get_all=True)
logger.info("Starting with pipeline %s", pipeline_bridge.name)
for functional_pipeline_job in functional_pipeline_jobs:
job = project.jobs.get(functional_pipeline_job.id)
logger.info("Starting with job %s", job.name)
try:
file_name = '__artifacts.zip'
with open(file_name, "wb") as f:
job.artifacts(streamed=True, action=f.write)
zip = zipfile.ZipFile(file_name)
zip.extractall("tmp")
logger.info("Downloaded artifacts of job %s", job.name)
except Exception:
continue
os.unlink(file_name)
restart_dir = os.listdir(pathlib.Path("tmp") / "results" / "iteration=0")[-1]
golden_values_source = (
pathlib.Path(ASSETS_DIR)
/ f"{restart_dir}"
/ "assets"
/ "basic"
/ f"{job.name.replace('_', '-').lower()}-{environment}"
/ f"golden_values_{environment}.json"
)
golden_values_target = (
pathlib.Path("tests")
/ "functional_tests"
/ 'test_cases'
/ job.stage
/ job.name
/ f"golden_values_{environment}.json"
)
if golden_values_source.exists():
pathlib.Path(golden_values_target.parent).mkdir(parents=True, exist_ok=True)
logger.info(
"Move artifacts from %s to %s", golden_values_source, golden_values_target
)
shutil.move(golden_values_source, golden_values_target)
else:
logger.info(
"Golden values for %s does not exist. Skip.", str(f"{job.stage} / {job.name}")
)
shutil.rmtree("tmp")
logger.info("beep boop: All done!")
if __name__ == "__main__":
main()
......@@ -19,6 +19,12 @@ BASE_PATH = pathlib.Path(__file__).parent.resolve()
)
@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on")
@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on")
@click.option(
"--a100-partition", required=False, type=str, help="Slurm partition to use", default=None
)
@click.option(
"--h100-partition", required=False, type=str, help="Slurm partition to use", default=None
)
@click.option("--output-path", required=True, type=str, help="Path to write GitLab job to")
@click.option("--container-image", required=True, type=str, help="LTS Container image to use")
@click.option("--container-tag", required=True, type=str, help="Container tag to use")
......@@ -28,6 +34,8 @@ BASE_PATH = pathlib.Path(__file__).parent.resolve()
type=str,
help="Name of job that created the downstream pipeline",
)
@click.option("--record-checkpoints", required=False, type=str, help="Values are 'true' or 'false'")
@click.option("--slurm-account", required=True, type=str, help="Slurm account to use")
@click.option("--tag", required=False, type=str, help="Tag (only relevant for unit tests)")
@click.option(
"--run-name", required=False, type=str, help="Run name (only relevant for release tests)"
......@@ -46,10 +54,14 @@ def main(
test_cases: str,
a100_cluster: str,
h100_cluster: str,
a100_partition: Optional[str],
h100_partition: Optional[str],
output_path: str,
container_image: str,
container_tag: str,
dependent_job: str,
record_checkpoints: str,
slurm_account: str,
tag: Optional[str] = None,
run_name: Optional[str] = None,
wandb_experiment: Optional[str] = None,
......@@ -97,14 +109,19 @@ def main(
else:
gitlab_pipeline = {
"stages": list(set([test_case.spec.model for test_case in list_of_test_cases])),
"default": {"interruptible": True},
"default": {
"interruptible": True,
"retry": {"max": 2, "when": "runner_system_failure"},
},
}
for test_case in list_of_test_cases:
if test_case.spec.platforms == "dgx_a100":
cluster = a100_cluster
partition = a100_partition
elif test_case.spec.platforms == "dgx_h100":
cluster = h100_cluster
partition = h100_partition
else:
raise ValueError(f"Platform {test_case.spec.platforms} unknown")
......@@ -118,11 +135,17 @@ def main(
f"--environment {test_case.spec.environment}",
f"--n-repeat {n_repeat}",
f"--time-limit {time_limit}",
f"--scope {scope}",
f"--test-case '{test_case.spec.test_case}'",
f"--container-tag {container_tag}",
f"--cluster {cluster}",
f"--record-checkpoints {record_checkpoints}",
f"--account {slurm_account}",
]
if partition is not None:
script.append(f"--partition {partition}")
if tag is not None:
script.append(f"--tag {tag}")
......
......@@ -29,6 +29,12 @@ def load_script(config_path: str) -> str:
@click.option(
"--test-case", required=False, type=str, help="Returns a single test-case with matching name."
)
@click.option(
"--environment",
required=True,
type=str,
help="Pass 'lts' for PyTorch 24.01 and 'dev' for a more recent version.",
)
@click.option(
"--output-path",
required=True,
......@@ -36,9 +42,20 @@ def load_script(config_path: str) -> str:
help="Directory where the functional test will write its artifacts to (Tensorboard logs)",
default="/opt/megatron-lm",
)
def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], output_path: str):
def main(
model: Optional[str],
scope: Optional[str],
test_case: Optional[str],
environment: str,
output_path: str,
):
workloads = common.load_workloads(
container_image='none', scope=scope, model=model, test_case=test_case, container_tag='none'
container_image='none',
scope=scope,
model=model,
test_case=test_case,
environment=environment,
container_tag='none',
)
for workload in workloads:
......@@ -46,6 +63,7 @@ def main(model: Optional[str], scope: Optional[str], test_case: Optional[str], o
continue
magic_values = dict(workload.spec)
magic_values["assets_dir"] = output_path
magic_values["artifacts_dir"] = output_path
file_path = (
pathlib.Path.cwd()
......
import json
import logging
import os
import pathlib
import re
......@@ -12,7 +13,6 @@ import click
import jetclient
import requests
import yaml
from jet import workloads
from jetclient.facades.objects import log as jet_log
from jetclient.services.dtos.pipeline import PipelineStatus
......@@ -20,6 +20,8 @@ from tests.test_utils.python_scripts import common
BASE_PATH = pathlib.Path(__file__).parent.resolve()
logger = logging.getLogger(__name__)
def register_pipeline_terminator(pipeline: jetclient.JETPipeline):
def sigterm_handler(_signo, _stack_frame):
......@@ -37,17 +39,24 @@ def launch_and_wait_for_completion(
environment: str,
n_repeat: int,
time_limit: int,
scope: str,
container_image: Optional[str],
container_tag: str,
cluster: str,
account: str,
record_checkpoints: str,
partition: Optional[str],
tag: Optional[str],
run_name: Optional[str],
wandb_experiment: Optional[str],
) -> jetclient.JETPipeline:
n_submit_errors = 0
cluster_config = {"account": account}
if partition is not None:
cluster_config['partition'] = partition
while n_submit_errors < 3:
n_submission_attempts = 0
while n_submission_attempts < 3:
try:
pipeline = jetclient.JETClient(
customer='mcore', gitlab_ci_token=os.getenv("RO_API_TOKEN"), env="prod"
).workloads.submit(
......@@ -56,13 +65,15 @@ def launch_and_wait_for_completion(
n_repeat=n_repeat,
time_limit=time_limit,
tag=tag,
scope=scope,
container_image=container_image,
container_tag=container_tag,
environment=environment,
record_checkpoints=record_checkpoints,
),
config_id=f"mcore/{common.resolve_cluster_config(cluster)}",
custom_config={
"launchers": {cluster: {"account": account, "ntasks_per_node": 8}},
"launchers": {cluster: cluster_config},
"executors": {
"jet-ci": {
"environments": {
......@@ -71,68 +82,77 @@ def launch_and_wait_for_completion(
"RUN_NAME": run_name or "",
"WANDB_API_KEY": os.getenv("WANDB_API_KEY") or "",
"WANDB_EXPERIMENT": wandb_experiment or "",
"RECORD_CHECKPOINTS": str(
"Record checkpoints"
in os.getenv("CI_MERGE_REQUEST_LABELS", "")
).lower(),
}
}
}
}
},
"outputs": {
"enabled": True,
"artifacts_storages": [common.resolve_artifact_config(cluster)],
},
},
wait_for_validation=True,
max_wait_time=(60 * 60),
)
except jetclient.clients.gitlab.GitlabAPIError as e:
logger.error(f"Faced {str(e)}. Waiting and retrying...")
n_submission_attempts += 1
time.sleep(2**n_submission_attempts * 5)
continue
if pipeline.get_status() == PipelineStatus.SUBMISSION_FAILED:
n_submit_errors += 1
print(f"Failed submitting pipeline. Let's try again ({n_submit_errors}/3)")
n_submission_attempts += 1
logger.info("Submission failed, attempt again (%s/3)", str(n_submission_attempts))
continue
break
register_pipeline_terminator(pipeline=pipeline)
print(
f"Pipeline triggered; inspect it here: https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/{pipeline.jet_id}",
flush=True,
logger.info(
"Pipeline triggered; inspect it here: https://gitlab-master.nvidia.com/dl/jet/ci/-/pipelines/%s",
pipeline.jet_id,
)
n_wait_attempts = 0
while n_wait_attempts < 3:
try:
pipeline.wait(max_wait_time=60 * 60 * 24 * 7, interval=60 * 1)
break
except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as e:
print(e)
time.sleep(60 * 3**n_wait_attempts)
pipeline = workloads.get_pipeline(pipeline.jet_id)
n_wait_attempts += 1
pipeline.wait(max_wait_time=60 * 60 * 24 * 7, interval=60 * 1, retries_on_error=3)
print(f"Pipeline terminated; status: {pipeline.get_status()}")
logger.info(f"Pipeline terminated; status: {pipeline.get_status()}")
return pipeline
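As a side note, a tiny self-contained sketch of how the RECORD_CHECKPOINTS value set above is derived from merge-request labels; the label string is an assumed example, not captured CI state.

import os

# Assumed example value; GitLab exposes MR labels as a comma-separated string.
os.environ["CI_MERGE_REQUEST_LABELS"] = "Expedite,Record checkpoints"

record_checkpoints = str(
    "Record checkpoints" in os.getenv("CI_MERGE_REQUEST_LABELS", "")
).lower()
print(record_checkpoints)  # -> "true"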
def download_job_assets(logs: List[jet_log.JETLog], iteration: int = 0) -> List[str]:
if not logs:
logger.info("No logs found for download.")
return [""]
assets_base_path = BASE_PATH / ".." / ".." / ".." / ".." / "results" / f"iteration={iteration}"
assets_base_path = BASE_PATH / ".." / ".." / ".." / "results" / f"iteration={iteration}"
for restart_idx, log in enumerate(logs):
assets = log.get_assets()
assets_path = assets_base_path / f"restart={restart_idx}"
assets_path.mkdir(parents=True, exist_ok=True)
for log_filename in assets.keys():
with open(assets_path / log_filename, "w") as fh:
assets[log_filename].download(pathlib.Path(fh.name))
for asset in assets:
(assets_path / asset.source_path).parent.mkdir(parents=True, exist_ok=True)
with open(assets_path / asset.source_path, "w") as fh:
dest = pathlib.Path(fh.name)
logger.info("Downloading log %s to %s", asset.source_path, str(dest))
asset.download(dest)
return assets
def extract_logs_to_string(logs: List[jet_log.JETLog]) -> List[str]:
if not logs:
logger.info("No logs found for download.")
return [""]
assets = logs[0].get_assets()
log_filename = [key for key in assets.keys() if key.endswith(".log")][0]
with tempfile.NamedTemporaryFile() as tmp_file:
assets[log_filename].download(pathlib.Path(tmp_file.name))
assets = logs[-1].get_assets()
asset = [asset for asset in assets if asset.name == "output_script-0.log"][0]
asset.download(pathlib.Path(tmp_file.name))
with open(pathlib.Path(tmp_file.name), "r") as fh:
return fh.readlines()
......@@ -161,6 +181,7 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]:
)
@click.option("--n-repeat", required=False, default=1, type=int)
@click.option("--time-limit", required=False, default=1800, type=int)
@click.option("--scope", required=False, default="mr", type=str)
@click.option(
"--account",
required=False,
......@@ -168,10 +189,12 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]:
help="Slurm account to use",
default="coreai_dlalgo_mcore",
)
@click.option("--partition", required=False, type=str, help="Slurm partition to use", default=None)
@click.option("--cluster", required=True, type=str, help="Cluster to run on")
@click.option("--container-tag", required=True, type=str, help="Base image of Mcore image")
@click.option("--container-image", required=False, type=str, help="Base image of Mcore image")
@click.option("--tag", required=False, type=str, help="Tag (only relevant for unit tests)")
@click.option("--record-checkpoints", required=False, type=str, help="Values are 'true' or 'false'")
@click.option(
"--run-name", required=False, type=str, help="Run name (only relevant for release tests)"
)
......@@ -187,14 +210,20 @@ def main(
environment: str,
n_repeat: int,
time_limit: int,
scope: str,
account: str,
partition: Optional[str],
cluster: str,
container_tag: str,
record_checkpoints: str,
tag: Optional[str] = None,
container_image: Optional[str] = None,
run_name: Optional[str] = None,
wandb_experiment: Optional[str] = None,
):
logging.basicConfig(level=logging.INFO)
logger.info('Started')
model_config_path = pathlib.Path(
BASE_PATH
/ ".."
......@@ -217,8 +246,10 @@ def main(
else:
test_type = "unit_test"
logger.info('test_type will be %s', test_type)
if test_type == "release" and (run_name is None or wandb_experiment is None):
print(f"Not all arguments provided ({run_name=}, {wandb_experiment=})")
logger.error(f"Not all arguments provided ({run_name=}, {wandb_experiment=})")
sys.exit(1)
n_attempts = 0
......@@ -230,13 +261,16 @@ def main(
environment=environment,
n_repeat=n_repeat,
time_limit=time_limit,
scope=scope,
container_image=container_image,
container_tag=container_tag,
cluster=cluster,
account=account,
partition=partition,
tag=tag,
run_name=run_name,
wandb_experiment=wandb_experiment,
record_checkpoints=record_checkpoints,
)
main_job = [job for job in pipeline.get_jobs() if job.name.startswith("basic")][0]
......@@ -247,25 +281,39 @@ def main(
jet_log = main_job.get_logs()
logs = extract_logs_to_string(logs=jet_log)
download_job_assets(logs=jet_log, iteration=n_iteration)
no_log = False
break
except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError) as e:
print(e)
time.sleep((3**n_download_attempt) * 60)
except (
requests.exceptions.ConnectionError,
json.decoder.JSONDecodeError,
UnicodeDecodeError,
) as e:
logger.error(e)
time.sleep(2 * n_download_attempt * 15)
n_download_attempt += 1
no_log = True
except (KeyError, IndexError) as e:
logger.error(e)
no_log = True
break
if no_log:
logger.error("Did not find any logs to download, retry.")
continue
concat_logs = "\n".join(logs)
if concat_logs.strip() == "":
logger.error("No logs found. Try again.")
n_attempts += 1
continue
if test_type != "release":
print(f"Logs:\n{concat_logs}")
success = pipeline.get_status() == PipelineStatus.SUCCESS
logger.info("Pipeline terminated with status %s", pipeline.get_status().name)
if test_type == "unit_test":
success = success and (
(
re.search(r'=.*?\bpassed\b.*?=', concat_logs)
and not re.search(r'=.*?\bfailed\b.*?=', concat_logs)
)
or "0 selected" in concat_logs
)
sys.exit(int(not success)) # invert for exit 0
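For illustration, a small sketch of the unit-test pass/fail check above run against an assumed pytest summary banner (the banner text is an example, not captured CI output).

import re

summary = "===== 12 passed, 2 warnings in 3.21s ====="  # assumed example banner
passed = bool(re.search(r'=.*?\bpassed\b.*?=', summary)) and not re.search(
    r'=.*?\bfailed\b.*?=', summary
)
print(passed)  # -> True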
if test_type != "release":
......@@ -277,22 +325,24 @@ def main(
or "uncorrectable ECC error encountered" in concat_logs
or "illegal memory access" in concat_logs
or "illegal instruction" in concat_logs
or "torch.distributed.DistNetworkError" in concat_logs
):
print("Detected NCCL failure, attempt restart.")
logger.error("Detected NCCL failure, attempt restart.")
n_attempts += 1
continue
if "FAILED tests/functional_tests/python_test_utils/test_ci_pipeline.py" in concat_logs:
print("Non-determinism, let's try another node.")
if "FAILED tests/functional_tests/python_test_utils" in concat_logs:
logger.error("Non-determinism, let's try another node.")
n_nondeterminism_attemps += 1
continue
sys.exit(1)
if parse_failed_job(logs=logs):
n_attempts += 1
continue
if parse_finished_training(logs=logs):
success = pipeline.get_status() == PipelineStatus.SUCCESS
sys.exit(int(not success)) # invert for exit 0
n_iteration += 1
sys.exit(1)
......
......@@ -3,14 +3,17 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: bert
nodes: 1
build: mcore-pyt-{environment}
gpus: 8
platforms: dgx_a100
time_limit:
n_repeat:
artifacts:
/workspace/data/bert_data: text/the_pile/bert_shard00
/workspace/checkpoints/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev: model/mcore_bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev/22410107
script: |-
ls
cd /opt/megatron-lm
......@@ -20,7 +23,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=pretrain_bert.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -30,26 +34,75 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
- bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G
- bert_mr_tp1_pp4_vp2_dgx_a100_1N8G
- bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G
- bert_mr_tp2_pp2_dgx_a100_1N8G
- bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
- environment: [lts, dev]
scope: [nightly]
n_repeat: [5]
time_limit: [3600]
test_case:
- bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2
- bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
- bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1
- bert_nightly_dgx_a100_1N8G_tp1_pp2
- bert_nightly_dgx_a100_1N8G_tp4_pp1
- test_case: [bert_mr_mcore_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp1_pp4_vp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [bert_nightly_dgx_a100_1N8G_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
# - test_case: [bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G]
# products:
# - environment: [dev] Update checkpoint
# scope: [mr]
......@@ -3,11 +3,14 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: gpt
build: mcore-pyt-{environment}
nodes: 1
gpus: 2
platforms: dgx_a100
time_limit:
n_repeat:
artifacts:
/workspace/data/gpt3_data: text/the_pile/shard00
/workspace/checkpoints/teacher: model/gpt_dummy_pyt/ckpt/24.10.0_bf16_teacher
......@@ -20,7 +23,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=./examples/export/knowledge_distillation/pretrain_gpt_modelopt.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -29,9 +33,7 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- scope: [nightly]
platforms: [dgx_a100]
time_limit: [1200]
environment: [lts, dev] # Disable dev for now
test_case:
- gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume
- test_case: [gpt3_nightly_mcore_te_tp2_pp1_modelopt_distill_resume]
products:
- environment: [dev, lts]
scope: [nightly]
......@@ -3,14 +3,14 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: gpt-nemo
build: mcore-nemo
nodes: 1
gpus: 8
platforms: dgx_a100
time_limit: 1800
scope: null
scope:
script: |-
ls
cd /opt/NeMo
......@@ -20,7 +20,8 @@ spec:
"DATA_CACHE_PATH='-'"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
"TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -30,10 +31,15 @@ spec:
bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
test_case:
- gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
- gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
\ No newline at end of file
......@@ -3,13 +3,17 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: gpt
build: mcore-pyt-{environment}
nodes: 1
gpus: 8
n_repeat: 5
platforms: dgx_a100
artifacts:
/workspace/data/gpt3_data: text/the_pile/shard00
/workspace/checkpoints/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev/22410107
/workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev/22410107
script: |-
ls
cd /opt/megatron-lm
......@@ -19,7 +23,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=pretrain_gpt.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -29,138 +34,694 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
platforms: [dgx_a100]
time_limit: [1800]
n_repeat: [5]
test_case:
- gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G
# - gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G # torch >= 2.4.0
- gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G
- gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G
- gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G
- gpt3_mr_te_tp2_pp2_dgx_a100_1N8G
- gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G
- gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G
- gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G
- gpt3_mr_tp2_pp2_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G # cp and attention
- gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G # cp and attention with a2a+p2p comm type
- gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G # cp and attention with a2a+p2p comm type
- environment: [lts, dev]
scope: [nightly]
platforms: [dgx_a100]
time_limit: [3600]
n_repeat: [5]
test_case:
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
# - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te # torch >= 2.4.0
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel
# - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts # non-determinism
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch
- gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts
- gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1
- gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce
- gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch
- environment: [lts]
scope: [nightly]
platforms: [dgx_a100]
time_limit: [3600]
n_repeat: [5]
test_case:
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel # non-determinism in dev
- environment: [lts, dev]
#######################################################################
# Nightly tests: Run both DEV and LTS unless something is flaky #
#######################################################################
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2_resume_torch_dist]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_resume_torch_dist]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_resume_torch_dist_te_4experts2parallel]
products:
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_te_4experts2parallel]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel]
products:
- environment: [dev]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_ep2_etp2_te_4experts2parallel_dp_last]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_cp2_pp2_ep2_te_4experts2parallel_dp_last]
products:
- environment: [dev]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_cp2_pp2_ep2_te_4experts2parallel_nondeterministic_dp_last]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1_resume_torch_dist]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_dist_optimizer_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp1_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp2_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp1_pp4_vp1_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_4experts]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp2_pp2_resume_torch_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_overlap_grad_reduce]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [gpt3_345m_nightly_dgx_a100_1N8G_tp4_pp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
# - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp2_pp2_resume_torch_dist_te_2experts]
# products:
# - environment: [dev, lts]
# scope: [nightly]
# - test_case: [gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_fsdp2_resume_torch_dist_te]
# products:
# - environment: [dev, lts]
# scope: [nightly]
#######################################################################
# Weekly tests: Run both DEV and LTS unless something is flaky #
#######################################################################
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel]
products:
- environment: [dev, lts]
scope: [weekly]
platforms: [dgx_h100]
time_limit: [9000]
test_case:
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_nondet_tp1_pp1_fp8_no_model_parallel
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_bf16_baseline
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp1_fp8_no_model_parallel
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp
- gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp1_pp2_fp8_pp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_fp8_tp_pp_sp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp2_pp2_native_fp8_tp_pp_sp]
products:
- environment: [dev, lts]
scope: [weekly]
- test_case: [gpt3_345m_weekly_dgx_h100_1N8G_mcore_tp4_pp2_fp8_tp_pp]
products:
- environment: [dev, lts]
scope: [weekly]
#######################################################################
# MR tests: Mostly DEV on MR, and LTS on nightly cadence, except for #
# some very important tests. #
#######################################################################
- test_case: [gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp1_uniform_full_recompute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G]
products:
# - environment: [dev] Until TE is at 1.12
# scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic_dgx_a100_1N8G]
products:
# - environment: [dev] Until TE is at 1.12
# scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp2_rope_embeddings_interleaved_no_fusion_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_persistent_ckpt_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_sequence_parallel_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_swiglu_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_sequence_parallel_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_swiglu_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_untie_embeddings_and_outputs_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_calculate_per_token_loss_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_decoupled_lr_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_tunable_overlap_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_uneven_pipeline_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_multi_dist_optimizer_instances_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_groupedGEMM_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_top2router_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts_etp1_ep4_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_dp_last_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_nondeterministic_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp2_cross_entropy_loss_fusion_dgx_a100_1N8G] Failing on max-memory
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_ddp_average_in_collective_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_no_create_attention_mask_in_dataloader_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic_dgx_a100_1N8G]
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion_dgx_a100_1N8G] # Failing on max-memory
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_no_mmap_bin_files_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_qk_layernorm_test_mode_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_tp2_pp2_uninstall_te_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_te_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_tp1_pp4_vp1_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp1_pp4_memory_speed]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_7b_mr_dgx_a100_1N8G_tp4_pp1_memory_speed]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [gpt3_mr_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective_dgx_a100_1N8G]
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
# - test_case: [gpt3_mr_mcore_te_tp2_pp1_fsdp2_resume_torch_dist_dgx_a100_1N8G]
# products:
# - environment: [dev]
# scope: [mr]
# - environment: [lts]
# scope: [nightly]
#######################################################################
# Super important MR tests that run for both DEV and LTS per MR #
#######################################################################
- test_case: [gpt3_mr_mcore_te_tp2_pp1_cp2_nondeterministic_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_te_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_tp1_pp4_vp1_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev, lts]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- test_case: [gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
# - test_case: [gpt3_mr_mcore_tp2_pp2_resume_torch_dist_uninstall_te_dgx_a100_1N8G]
# products:
# - environment: [dev, lts]
# scope: [mr]
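The `products` matrices above are easiest to read as a Cartesian expansion: each `environment`/`scope` entry under a `test_case` multiplies out into one concrete CI job per combination (e.g. a `dev`/`mr` job and an `lts`/`nightly` job for the same case). The sketch below is a minimal, hypothetical illustration of that expansion, assuming a simplified schema; `expand_products` and the inline sample are not part of the repository's actual pipeline generator.

```python
from itertools import product

def expand_products(test_case_entries):
    """Expand each products entry into concrete (test_case, environment, scope) jobs.

    Illustrative only -- the real generator may also fan out keys such as
    n_repeat, gpus, or time_limit in the same way.
    """
    jobs = []
    for entry in test_case_entries:
        for test_case in entry["test_case"]:
            for prod in entry["products"]:
                for env, scope in product(prod["environment"], prod["scope"]):
                    jobs.append({"test_case": test_case, "environment": env, "scope": scope})
    return jobs

# Hypothetical sample mirroring one block of the YAML above.
sample = [
    {
        "test_case": ["gpt3_mr_mcore_te_tp2_pp2_dgx_a100_1N8G"],
        "products": [{"environment": ["dev", "lts"], "scope": ["mr"]}],
    }
]

for job in expand_products(sample):
    print(job)
# -> one 'dev'/'mr' job and one 'lts'/'mr' job for the same test case
```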
......@@ -2,18 +2,17 @@ type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
launchers:
type:slurm:
ntasks_per_node: '{gpus}'
spec:
name: '{test_case}'
name: '{test_case}_{environment}'
model: multimodal-llava
build: mcore-pyt-{environment}
nodes: 1
gpus: 8
platforms: dgx_a100
time_limit: 1800
scope: null
time_limit:
n_repeat:
test_case:
scope:
script: |-
ls
cd /opt/megatron-lm
......@@ -23,7 +22,8 @@ spec:
"DATA_CACHE_PATH='-'"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_vlm.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -33,19 +33,39 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
- test_case: [multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
gpus: [8]
test_case:
- multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G
- environment: [lts, dev]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
gpus: [7]
test_case:
- multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
- multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_freeze_vit_freeze_lm_dist_opt_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
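The llava spec above renders placeholders such as `{test_case}` and `{environment}` into the job `name` and then hands settings to `run_ci_test.sh` as `KEY=VALUE` strings, now with separate `CHECKPOINT_SAVE_PATH` and `CHECKPOINT_LOAD_PATH` entries. Below is a minimal sketch of that substitution and parsing, assuming simple `str.format` semantics; the `render_arguments` helper, the sample context values, and the way `{assets_dir}`/`{artifacts_dir}` are resolved are assumptions, not the harness's actual implementation.

```python
def render_arguments(templates, context):
    """Substitute {placeholders} and split each rendered 'KEY=VALUE' string.

    Sketch only: the real run_ci_test.sh consumes these as shell variables.
    """
    rendered = [t.format(**context) for t in templates]
    return dict(item.split("=", 1) for item in rendered)

# Hypothetical context for one job generated from the matrix above.
context = {
    "test_case": "multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G",
    "environment": "dev",
    "model": "multimodal-llava",
    "artifacts_dir": "/tmp/artifacts",
    "name": "multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G_dev",
}

templates = [
    "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints",
    "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}/checkpoints",
    "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml",
]

args = render_arguments(templates, context)
print(args["CHECKPOINT_LOAD_PATH"])
# -> /workspace/checkpoints/multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G_dev/checkpoints
```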
......@@ -3,7 +3,7 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: "{test_case}"
name: '{test_case}_{environment}'
model: t5
build: mcore-pyt-{environment}
nodes: 1
......@@ -11,6 +11,7 @@ spec:
platforms: dgx_a100
artifacts:
/workspace/data/t5_data: text/the_pile/t5_shard00
/workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev: model/mcore_t5/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev/22410107
script: |-
ls
cd /opt/megatron-lm
......@@ -20,7 +21,8 @@ spec:
"DATA_CACHE_PATH=/workspace/data/cache"
"OUTPUT_PATH={assets_dir}"
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
"TRAINING_SCRIPT_PATH=pretrain_t5.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
......@@ -30,32 +32,77 @@ spec:
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G
- t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
- t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G
- t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
- environment: [lts]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
- environment: [lts, dev]
scope: [nightly]
time_limit: [9000]
n_repeat: [1]
test_case:
- t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch
- t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
- t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel
- t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1
- t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch
- t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1
- test_case: [t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G]
products:
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
- environment: [lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1]
products:
- environment: [dev, lts]
scope: [nightly]
- test_case: [t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G]
products:
- environment: [dev]
scope: [mr]
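For the frozen-resume t5 case above, the spec mounts a pinned checkpoint artifact under `/workspace/checkpoints/<name>` and the job loads from `CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}`. The following is a small sketch of how that load path could be derived, assuming `name` is the `'{test_case}_{environment}'` template from the spec; the helper name is illustrative and the directory layout beyond what the artifacts mapping shows is an assumption.

```python
def frozen_checkpoint_load_path(test_case: str, environment: str) -> str:
    """Build the load path for a pinned ('frozen') checkpoint.

    Assumes name == f"{test_case}_{environment}", matching the spec's name
    template; /workspace/checkpoints is the mount point from the artifacts map.
    """
    name = f"{test_case}_{environment}"
    return f"/workspace/checkpoints/{name}"

print(frozen_checkpoint_load_path(
    "t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G", "dev"
))
# -> /workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev
```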
......@@ -3,7 +3,7 @@ format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
name: '{test_case}'
name: '{test_case}_{environment}_{tag}'
model: unit-tests
nodes: 1
build: mcore-pyt-{environment}
......@@ -60,21 +60,84 @@ spec:
fi
done <<< "$IGNORE_TEST_CASES"
echo "------ARGUMENTS for SLURM ---"
MASTER_ADDR=${{MASTER_ADDR:-localhost}}
MASTER_PORT=${{MASTER_PORT:-6000}}
NUM_NODES=${{NUM_NODES:-${{SLURM_NNODES}}}}
GPUS_PER_NODE=${{GPUS_PER_NODE:-8}}
NODE_RANK=${{SLURM_NODEID:-0}}
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
--node_rank $NODE_RANK
--log-dir {assets_dir}
--tee "0:3"
--redirects "3"
)
# Reduce memory usage by NCCL
export NCCL_MAX_NCHANNELS=1
export NCCL_NVLS_ENABLE=0
for i in $(seq $UNIT_TEST_REPEAT); do
CMD=$(echo pytest -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail ${{IGNORE_ARGS[@]}} -m "'${{MARKER_ARG}}'" $BUCKET)
CMD=$(echo torchrun ${{DISTRIBUTED_ARGS[@]}} -m pytest \
-xvs \
--cov-report=term \
--cov-branch \
--cov=megatron/core \
--cov-report xml:coverage.xml \
--no-cov-on-fail ${{IGNORE_ARGS[@]}} \
-m "'${{MARKER_ARG}}'" $BUCKET)
eval "$CMD"
done
ls -al
cp .coverage_0 {assets_dir}/coverage_report
cp coverage.xml {assets_dir}
products:
- test_case: [tests/unit_tests/data/]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/dist_checkpointing/*.py]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/dist_checkpointing/models/]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/transformer/*.py]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests/transformer/moe]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
- test_case: [tests/unit_tests]
products:
- environment: [lts, dev]
tag: [latest, legacy]
scope: [unit-tests]
n_repeat: [1]
time_limit: [1800]
test_case:
- tests/unit_tests/data/
- tests/unit_tests/dist_checkpointing/*.py
- tests/unit_tests/dist_checkpointing/models/
- tests/unit_tests/transformer/*.py
- tests/unit_tests/transformer/moe
- tests/unit_tests
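The unit-test script above switches from invoking pytest directly to launching it through `torchrun` with the `DISTRIBUTED_ARGS` shown, repeating the run `UNIT_TEST_REPEAT` times and copying the coverage XML to the assets directory. Purely as an illustration of what that loop assembles (not how the CI actually invokes it), here is a Python sketch that builds and runs an equivalent command; the defaults mirror the shell fallbacks and are assumptions for a local single-node run, and marker/ignore handling is omitted.

```python
import os
import subprocess

def run_unit_bucket(bucket: str, repeat: int = 1, assets_dir: str = "/tmp/assets"):
    """Assemble the torchrun + pytest command the CI script builds in shell.

    Sketch only: MARKER_ARG and IGNORE_ARGS handling from the YAML script
    are intentionally left out.
    """
    master_addr = os.environ.get("MASTER_ADDR", "localhost")
    master_port = os.environ.get("MASTER_PORT", "6000")
    gpus_per_node = os.environ.get("GPUS_PER_NODE", "8")

    cmd = [
        "torchrun",
        "--nproc_per_node", gpus_per_node,
        "--nnodes", "1",
        "--master_addr", master_addr,
        "--master_port", master_port,
        "--node_rank", "0",
        "--log-dir", assets_dir,
        "-m", "pytest", "-xvs",
        "--cov-report=term",
        "--cov-branch",
        "--cov=megatron/core",
        "--cov-report", "xml:coverage.xml",
        "--no-cov-on-fail",
        bucket,
    ]
    for _ in range(repeat):
        subprocess.run(cmd, check=True)

# Example (requires torch and pytest-cov installed):
# run_unit_bucket("tests/unit_tests/data/", repeat=1)
```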
......@@ -192,63 +192,15 @@ def test_builder():
assert datasets[1] is None
assert datasets[2] is None
# This build used to fail when building datasets without a sample buffer
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend_per_split=[blends[Split.train], None, None],
)
try:
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [1000, None, None], lambda: True, config
).build()
raise RuntimeError
except IndexError:
##
#
# The size per dataset is a function of the requested size, the weight per dataset,
# and a constant coefficient. The sizes, and consequently the total size to request,
# are modified such that the weights may or may not be sufficiently representative.
# To fix this, the weights should be reset according to the new sizes:
#
# S := size
# W := weights
#
# S = func(S, W)
#
# W = S / sum(S)
#
##
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend_per_split=[blends[Split.train], None, None],
renormalize_blend_weights=True,
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [1000, None, None], lambda: True, config
).build()
assert (
len(datasets[0]) >= 1000
and len(datasets[0]) <= 1000 * (1 + _MARGIN) + _NUM_DATASETS
)
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend_per_split=[blends[Split.train], blends[Split.valid], blends[Split.test]],
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [100, 100, 100], lambda: True, config
).build()
assert (
len(datasets[0]) >= 100 and len(datasets[0]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[2]) >= 100 and len(datasets[2]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
config = BlendedMegatronDatasetConfig(
random_seed=1234,
......@@ -322,73 +274,16 @@ def test_builder():
assert len(datasets[1]) == sum(_SIZES[Split.train]) / 2
assert datasets[2] is None
# 990 9 1
# 100000 1000 1
# []
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend=blends[Split.train],
split="990,9,1",
)
try:
# All three of 100000, 1000, and 1 result in error, yet 10000 and 100 do not
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [100000, 1000, 1], lambda: True, config
).build()
except IndexError:
##
#
# The size per dataset is a function of the requested size, the weight per dataset,
# and a constant coefficient. The sizes, and consequently the total size to request,
# are modified such that the weights may or may not be sufficiently representative.
# To fix this, the weights should be reset according to the new sizes:
#
# S := size
# W := weights
#
# S = func(S, W)
#
# W = S / sum(S)
#
##
# This build used to fail when building datasets without a sample buffer
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend=blends[Split.train],
split="990,9,1",
renormalize_blend_weights=True,
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [100000, 1000, 1], lambda: True, config
).build()
assert (
len(datasets[0]) >= 100000
and len(datasets[0]) <= 100000 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[1]) >= 1000
and len(datasets[1]) <= 1000 * (1 + _MARGIN) + _NUM_DATASETS
)
assert len(datasets[2]) >= 1 and len(datasets[2]) <= 1 * (1 + _MARGIN) + _NUM_DATASETS
config = BlendedMegatronDatasetConfig(
random_seed=1234,
sequence_length=_SEQUENCE_LENGTH,
blend=blends[Split.train],
split="990,9,1",
)
datasets = BlendedMegatronDatasetBuilder(
TestDataset, [10000, 100, 0], lambda: True, config
).build()
assert (
len(datasets[0]) >= 10000
and len(datasets[0]) <= 10000 * (1 + _MARGIN) + _NUM_DATASETS
)
assert (
len(datasets[1]) >= 100 and len(datasets[1]) <= 100 * (1 + _MARGIN) + _NUM_DATASETS
)
assert len(datasets[2]) == 0
if __name__ == "__main__":
......
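The comment block in the test above explains why `renormalize_blend_weights=True` avoids the IndexError: the per-dataset sample counts S are derived from the requested total and the blend weights W, the counts are then adjusted, and the weights must be reset to W = S / sum(S) so they stay representative of the sizes actually drawn. Below is a minimal numeric illustration of that renormalization step; the helper name and the example numbers are mine, not taken from the dataset builder.

```python
def renormalize_blend_weights(sizes):
    """Reset blend weights from the (possibly adjusted) per-dataset sizes: W = S / sum(S)."""
    total = sum(sizes)
    return [s / total for s in sizes]

# Hypothetical per-dataset sample counts after the builder adjusts them.
sizes = [620, 280, 100]
weights = renormalize_blend_weights(sizes)
print(weights)       # [0.62, 0.28, 0.1]
print(sum(weights))  # close to 1.0
```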
......@@ -10,6 +10,11 @@ def pytest_sessionfinish(session, exitstatus):
session.exitstatus = 0
@pytest.fixture(scope="class")
def tmp_dir_per_class(tmp_path_factory):
return tmp_path_factory.mktemp("data")
@pytest.fixture(scope='session', autouse=True)
def set_default_dist_ckpt_strategy():
def get_pyt_dist_save_sharded_strategy():
......
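The conftest change above adds a class-scoped `tmp_dir_per_class` fixture built from pytest's `tmp_path_factory`, so all tests in one class share a single temporary directory. A minimal, hypothetical usage sketch follows; the test class and its contents are illustrative and not part of this commit.

```python
import pytest

@pytest.fixture(scope="class")
def tmp_dir_per_class(tmp_path_factory):
    # Same shape as the fixture added in conftest.py: one temp dir per test class.
    return tmp_path_factory.mktemp("data")

class TestCheckpointRoundTrip:
    def test_write(self, tmp_dir_per_class):
        (tmp_dir_per_class / "ckpt.txt").write_text("state")

    def test_read(self, tmp_dir_per_class):
        # Same directory as in test_write because the fixture is class-scoped.
        assert (tmp_dir_per_class / "ckpt.txt").read_text() == "state"
```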