Commit 688448db authored by silencealiang's avatar silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -3,7 +3,11 @@ import glob
import json
import logging
import os
import pathlib
from typing import Callable, Dict, List, Optional, Union
import numpy as np
import pydantic
from tensorboard.backend.event_processing import event_accumulator
# By default TB tries to be smart about what to load in memory to avoid OOM
......@@ -12,27 +16,94 @@ from tensorboard.backend.event_processing import event_accumulator
# are small/short.
SIZE_GUIDANCE = {event_accumulator.TENSORS: 0, event_accumulator.SCALARS: 0}
logger = logging.getLogger()
logger = logging.getLogger(__name__)
class TypeOfTest(enum.Enum):
APPROX = 1
def approximate_threshold(rtol: float) -> Callable:
def _func(y_pred: List[Union[float, int]], y_true: List[Union[float, int]]):
return np.mean([np.mean(y_pred), np.mean(y_true)]) * rtol
return _func
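# Example (hypothetical numbers): approximate_threshold(rtol=0.05) applied to
# y_pred=[10.0, 10.2] and y_true=[10.1, 10.3] returns 0.05 * 10.15 = 0.5075,
# which `pipeline` below passes to np.allclose as the absolute tolerance.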
class TypeOfTestResult(enum.Enum):
APPROXIMATE = 1
DETERMINISTIC = 2
TYPE_OF_TEST_TO_METRIC = {
TypeOfTest.DETERMINISTIC: ["lm loss", "num-zeros"],
TypeOfTest.APPROX: ["lm loss", "iteration-time", "mem-allocated-bytes"],
}
class Test(pydantic.BaseModel):
pass
class NotApproximateError(Exception):
"""Raised if comparison is not within approximate bounds"""
class NotDeterminsticError(Exception):
"""Raised if comparison is not within approximate bounds"""
class ApproximateTest(Test):
atol: Optional[Union[int, float]] = 0
atol_func: Optional[Callable] = None
rtol: float = 1e-5
@property
def type_of_test_result(self) -> TypeOfTestResult:
return TypeOfTestResult.APPROXIMATE
def error_message(self, metric_name: str) -> NotApproximateError:
return NotApproximateError(f"Approximate comparison of {metric_name}: FAILED")
class DeterministicTest(Test):
@property
def atol(self) -> Union[int, float]:
return 0
atol_func: Optional[Callable] = None
@property
def rtol(self) -> float:
return 0.0
METRIC_TO_THRESHOLD = {
"iteration-time": 0.8,
"mem-allocated-bytes": 3 * 1000 * 1000, # 3MB
"lm loss": 0.05,
}
@property
def type_of_test_result(self) -> TypeOfTestResult:
return TypeOfTestResult.DETERMINISTIC
def error_message(self, metric_name: str) -> NotDeterminsticError:
return NotDeterminsticError(f"Exact comparison of {metric_name}: FAILED")
def read_tb_logs_as_list(path, index=0):
class GoldenValueMetric(pydantic.BaseModel):
start_step: int
end_step: int
step_interval: int
values: Dict[int, Union[int, float, str]]
def __repr__(self):
return f"Values ({self.start_step},{self.end_step},{self.step_interval}): {', '.join([str(f'({step}, {value})') for step, value in self.values.items()])}"
class GoldenValues(pydantic.RootModel):
root: Dict[str, GoldenValueMetric]
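# Parses golden-values JSON of the shape used by the files further below, e.g.:
# {"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48424, ...}}}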
class MissingTensorboardLogsError(Exception):
"""Raised if Tensorboard logs are not found"""
class UndefinedMetricError(Exception):
"""Raised if a golden values metric has no test definition"""
class SkipMetricError(Exception):
"""Raised if a metric shall be skipped"""
def read_tb_logs_as_list(
path, index: int = 0, train_iters: int = 50, start_idx: int = 1, step_size: int = 5
) -> Optional[Dict[str, GoldenValueMetric]]:
"""Reads a TensorBoard Events file from the input path, and returns the
summary specified as input as a list.
......@@ -46,13 +117,11 @@ def read_tb_logs_as_list(path, index=0):
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
summaries = {}
if not files:
logger.info(f"File not found matching: {path}/events* || {path}/results/events*")
return summaries
logger.error(f"File not found matching: {path}/events* || {path}/results/events*")
return None
files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)))
files.sort(key=lambda x: os.path.getmtime(os.path.join(path, pathlib.Path(x).name)))
accumulators = []
if index == -1:
......@@ -66,30 +135,150 @@ def read_tb_logs_as_list(path, index=0):
ea.Reload()
accumulators.append(ea)
summaries = {}
for ea in accumulators:
for scalar_name in ea.Tags()["scalars"]:
if scalar_name in summaries:
summaries[scalar_name] += [round(x.value, 5) for x in ea.Scalars(scalar_name)]
for x in ea.Scalars(scalar_name):
if x.step not in summaries[scalar_name]:
summaries[scalar_name][x.step] = round(x.value, 5)
else:
summaries[scalar_name] = [round(x.value, 5) for x in ea.Scalars(scalar_name)]
summaries[scalar_name] = {
x.step: round(x.value, 5) for x in ea.Scalars(scalar_name)
}
golden_values = {}
for metric, values in summaries.items():
print(
f"Extracted {len(summaries[scalar_name])} values of {scalar_name} from Tensorboard \
logs. Here are the first 5 values: {summaries[scalar_name][:5]}"
# Add missing values
values = {
k: (values[k] if k in values else "nan")
for k in range(1, train_iters + 1)
if k == start_idx or (k > start_idx and int(k) % step_size == 0)
}
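# With the defaults (train_iters=50, start_idx=1, step_size=5) this keeps steps 1, 5, 10, ..., 50
# and stores the string "nan" for any kept step that the run did not log.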
golden_values[metric] = GoldenValueMetric(
start_step=min(values.keys()),
end_step=max(values.keys()),
step_interval=step_size,
values=values,
)
return summaries
# for metric_name, golden_value in golden_values.items():
# logger.info(
# f"Extracted {golden_value.end_step} values of {metric_name} from Tensorboard logs. Here are the sampled values: {golden_value.values}"
# )
return golden_values
def read_golden_values_from_json(
golden_values_path: Union[str, pathlib.Path]
) -> Dict[str, GoldenValueMetric]:
with open(golden_values_path) as f:
if os.path.exists(golden_values_path):
with open(golden_values_path) as f:
return GoldenValues(**json.load(f)).root
raise ValueError(f"File {golden_values_path} not found!")
def load_expected_data():
expected_metrics_file = os.getenv("EXPECTED_METRICS_FILE")
if expected_metrics_file is None:
raise ValueError("Unknown EXPECTED_METRICS_FILE")
def _filter_checks(
checks: List[Union[ApproximateTest, DeterministicTest]], filter_for_type_of_check
):
return [test for test in checks if test.type_of_test_result == filter_for_type_of_check]
with open(expected_metrics_file) as f:
if os.path.exists(expected_metrics_file):
with open(expected_metrics_file) as f:
return json.load(f)
def pipeline(
compare_approximate_results: bool,
golden_values: Dict[str, GoldenValueMetric],
tensorboard_logs: Dict[str, GoldenValueMetric],
checks: Dict[str, List[Union[ApproximateTest, DeterministicTest]]],
):
all_test_passed = True
failed_metrics = []
for golden_value_key, golden_value in golden_values.items():
try:
if golden_value_key not in list(tensorboard_logs.keys()):
raise MissingTensorboardLogsError(
f"Metric {golden_value_key} not found in Tensorboard logs! Please modify `model_config.yaml` to record it."
)
if golden_value_key not in checks or len(checks[golden_value_key]) == 0:
logger.debug(
"For metric `%s`, no check was defined. Will fall back to `DeterminsticTest` with exact thresholds.",
golden_value_key,
)
test = DeterministicTest()
else:
print(f"File {expected_metrics_file} not found!")
# For approximate tests, we cannot use deterministic
if compare_approximate_results is True:
tests = _filter_checks(checks[golden_value_key], TypeOfTestResult.APPROXIMATE)
# For deterministic, we can fall back to approximate
else:
tests = _filter_checks(
checks[golden_value_key], TypeOfTestResult.DETERMINISTIC
) or _filter_checks(checks[golden_value_key], TypeOfTestResult.APPROXIMATE)
if len(tests) != 1:
raise SkipMetricError(
f"No {'approximate' if compare_approximate_results is True else 'deterministic'} check found for {golden_value_key}: SKIPPED"
)
test = tests[0]
golden_value_list = list(golden_value.values.values())
actual_value_list = [
value
for value_step, value in tensorboard_logs[golden_value_key].values.items()
if value_step in golden_value.values.keys()
]
if golden_value_key == "iteration-time":
actual_value_list = actual_value_list[3:-1]
golden_value_list = golden_value_list[3:-1]
logger.info(
"For metric `%s`, the first 3 and the last scalars are removed from the list to reduce noise.",
golden_value_key,
)
actual_value_list = [np.inf if type(v) is str else v for v in actual_value_list]
golden_value_list = [np.inf if type(v) is str else v for v in golden_value_list]
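# Mapping unlogged ("nan") entries to np.inf makes a step that is missing from only one of the
# two series fail np.allclose, while a step missing from both still compares equal (inf == inf).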
if not np.allclose(
actual_value_list,
golden_value_list,
rtol=test.rtol,
atol=(
test.atol_func(actual_value_list, golden_value_list)
if test.atol_func is not None
else test.atol
),
):
logger.info("Actual values: %s", ", ".join([str(v) for v in actual_value_list]))
logger.info("Golden values: %s", ", ".join([str(v) for v in golden_value_list]))
raise test.error_message(golden_value_key)
result = f"{test.type_of_test_result.name} test for metric {golden_value_key}: PASSED"
result_code = 0
except (NotApproximateError, NotDeterminsticError, MissingTensorboardLogsError) as e:
result = str(e)
result_code = 1
except SkipMetricError:
logger.info(f"{test.type_of_test_result.name} test for {golden_value_key}: SKIPPED")
continue
log_emitter = logger.info if result_code == 0 else logger.error
log_emitter(result)
if result_code == 1:
all_test_passed = False
failed_metrics.append(golden_value_key)
assert all_test_passed, f"The following metrics failed: {', '.join(failed_metrics)}"
from typing import Dict, List, Union
import pytest
from tests.functional_tests.python_test_utils import common
def pytest_addoption(parser):
"""
Additional command-line arguments passed to pytest.
"""
parser.addoption(
"--allow-nondeterministic-algo",
action="store_true",
default=False,
help="If set, test system checks for approximate results.",
)
parser.addoption("--golden-values-path", action="store", help="Path to golden values")
parser.addoption(
"--train-iters", action="store", default=100, help="Number of train iters", type=int
)
parser.addoption("--tensorboard-path", action="store", help="Path to tensorboard records")
parser.addoption("--model-config-path", action="store", help="Path to model_config.yaml")
@pytest.fixture
def compare_approximate_results(request) -> bool:
"""Simple fixture returning whether to check against results approximately."""
return request.config.getoption("--allow-nondeterministic-algo") is True
@pytest.fixture
def golden_values(request):
"""Simple fixture returning golden values."""
return common.read_golden_values_from_json(request.config.getoption("--golden-values-path"))
@pytest.fixture
def train_iters(request):
"""Simple fixture returning number of train iters."""
return request.config.getoption("--train-iters")
@pytest.fixture
def tensorboard_logs(request, train_iters):
"""Simple fixture returning tensorboard metrics."""
return common.read_tb_logs_as_list(
request.config.getoption("--tensorboard-path"), train_iters=train_iters
)
@pytest.fixture
def tensorboard_path(request):
"""Simple fixture returning path to tensorboard logs."""
return request.config.getoption("--tensorboard-path")
@pytest.fixture
def model_config_path(request):
"""Simple fixture returning path to model_config.yaml."""
return request.config.getoption("--model-config-path")
......@@ -2,14 +2,20 @@ import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import json
import logging
import click
import yaml
from tests.functional_tests.python_test_utils import common
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@click.command()
@click.option("--logs-dir", required=True, type=str, help="Path to Tensorboard logs")
@click.option("--train-iters", required=True, type=int, help="Number of train iters")
@click.option("--output-path", required=False, type=str, help="Path to write golden values")
@click.option(
"--is-convergence-test/--is-normal-test",
......@@ -17,22 +23,39 @@ from tests.functional_tests.python_test_utils import common
help="Tensorboard index to extract",
default=False,
)
def collect_train_test_metrics(logs_dir: str, output_path: str, is_convergence_test: bool):
summaries = common.read_tb_logs_as_list(logs_dir, index=-1 if is_convergence_test else 0)
train_metrics = {
metric_name: {
"start_step": 0,
"end_step": len(metric_values),
"step_interval": 5,
"values": metric_values[0 : len(metric_values) : 5],
}
for metric_name, metric_values in summaries.items()
def collect_train_test_metrics(
logs_dir: str, train_iters: int, output_path: str, is_convergence_test: bool
):
summaries = common.read_tb_logs_as_list(
logs_dir, index=(0 if not is_convergence_test else -1), train_iters=train_iters, start_idx=1
)
if summaries is None:
logger.warning("No tensorboard logs found, no golden values created.")
return
summaries = {
golden_value_key: golden_value
for (golden_value_key, golden_value) in summaries.items()
if golden_value_key
in [
"iteration-time",
"mem-allocated-bytes",
"mem-max-allocated-bytes",
"lm loss",
"num-zeros",
]
}
if output_path is not None:
with open(output_path, "w") as fh:
json.dump(train_metrics, fh)
json.dump(
{
golden_value_key: golden_values.model_dump()
for golden_value_key, golden_values in summaries.items()
},
fh,
)
if __name__ == "__main__":
......
import os
from typing import List, Union
import numpy as np
import pytest
from .common import (
METRIC_TO_THRESHOLD,
TYPE_OF_TEST_TO_METRIC,
TypeOfTest,
load_expected_data,
read_tb_logs_as_list,
)
@pytest.fixture(params=load_expected_data().items())
def expected_data(request):
return request.param
# If we require a variation of tests for any of the other pipelines we can just inherit this class.
class TestCIPipeline:
allow_nondeterministic = bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO")))
logs_dir = os.getenv("LOGS_DIR")
# Replace symbol in namespace to fix function call result for lifetime of
# this class.
def _test_helper(self, metric_type: str, metric_dict: List[Union[int, float]], test_type):
expected_list = metric_dict['values']
print(f"The list of expected values: {expected_list} for metric {metric_type}")
try:
actual_list = read_tb_logs_as_list(self.logs_dir)[metric_type]
except KeyError as e:
raise KeyError(
f"Required metric {metric_type} not found in TB logs. Please make sure your model \
exports this metric, as it is required by the test case/golden values file"
) from e
if actual_list is None:
raise ValueError(f"No values of {metric_type} found in TB logs.")
actual_list_sliced = actual_list[
metric_dict["start_step"] : metric_dict["end_step"] : metric_dict["step_interval"]
]
print(f"The list of actual values: {actual_list_sliced}")
if metric_type == "iteration-time":
actual_list_sliced = actual_list_sliced[3:]
expected_list = expected_list[3:]
print("Removing first items of values for metric_type iteration-time")
if test_type == TypeOfTest.DETERMINISTIC:
assert np.allclose(
actual_list_sliced, expected_list, rtol=0, atol=0
), f"Actual is not equal to Expected for {metric_type}"
elif test_type == TypeOfTest.APPROX:
assert np.allclose(
actual_list_sliced, expected_list, rtol=1e-5, atol=METRIC_TO_THRESHOLD[metric_type]
), f"Actual is not equal to Expected for {metric_type}"
else:
raise ValueError(f"Unexpected test_type {test_type} provided")
def test_approx(self, expected_data):
expected_metric, expected_values = expected_data
if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.APPROX]:
self._test_helper(expected_metric, expected_values, TypeOfTest.APPROX)
else:
print(f"Skipping metric {expected_metric} for approximate as it is deterministic only.")
@pytest.mark.skipif(allow_nondeterministic, reason="Cannot expect exact results")
def test_deterministic(self, expected_data):
expected_metric, expected_values = expected_data
if expected_metric in TYPE_OF_TEST_TO_METRIC[TypeOfTest.DETERMINISTIC]:
self._test_helper(expected_metric, expected_values, TypeOfTest.DETERMINISTIC)
else:
print(f"Skipping metric {expected_metric} for deterministic as it is approximate only.")
# # @TODO: This is inactive, do we want to activate it?
# def iteration_timing_node(self):
# expected_iteration_timing_avg = self.expected["train_step_timing_avg"]
# iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"]
# idx = len(iteration_time) // 3
# iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:])
# assert (
# expected_iteration_timing_avg
# == pytest.approx(expected=iteration_time_avg, rel=self.margin_time)
# ), f"The time per global step must be approximately {expected_iteration_timing_avg} but "
# "it is {iteration_time_avg}."
# if deterministic, then also approx
# if not deterministic, then also approx
import json
import os
import numpy as np
import pytest
import scipy.stats as ss
from scipy.integrate import trapezoid
from .common import read_tb_logs_as_list
LOGS_DIR = os.getenv("LOGS_DIR")
EXPECTED_METRICS_FILE = os.getenv("EXPECTED_METRICS_FILE")
# If we require a variation of tests for any of the other pipelines we can just inherit this class.
class TestFP8CIPipeline:
margin_loss, margin_time = 0.2, 0.1
auc_threshold, correlation_threshold = 0.01, 0.999
expected = None
def _setup(self):
if os.path.exists(EXPECTED_METRICS_FILE):
with open(EXPECTED_METRICS_FILE) as f:
self.expected = json.load(f)
if self.expected is None:
raise FileNotFoundError("Expected data is none")
def _get_actual(self, loss_type):
actual_list = read_tb_logs_as_list(LOGS_DIR)[loss_type]
assert (
actual_list is not None
), f"No TensorBoard events file was found in the logs for {loss_type}."
return actual_list
def _margin_test_helper(self, loss_type):
expected = self.expected[loss_type]
expected_list = np.array(expected["values"])
actual_list = self._get_actual(loss_type)
actual_list_sliced = np.array(
actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]]
)
max_diff_index = np.argmax(np.abs(actual_list_sliced - expected_list))
max_diff = np.abs(actual_list_sliced[max_diff_index] - expected_list[max_diff_index])
print(
"[INFO - margin]: "
f"maximum absolute difference for {loss_type} is {max_diff} at index {max_diff_index}, "
f"Actual: {actual_list_sliced[max_diff_index]}, "
f"Expected: {expected_list[max_diff_index]}"
)
assert np.allclose(
actual_list_sliced, expected_list, rtol=1e-5, atol=self.margin_loss
), f"Actual is not equal to Expected for {loss_type}"
def _auc_test_helper(self, loss_type):
expected = self.expected[loss_type]
expected_list = np.array(expected["values"])
actual_list = self._get_actual(loss_type)
actual_list_sliced = np.array(
actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]]
)
def compute_auc(y_values):
x_values = np.arange(0, len(y_values), 1)
area = trapezoid(y_values, x_values)
return round(area, 5)
baseline_area = compute_auc(expected_list)
current_area = compute_auc(actual_list_sliced)
diff = abs(baseline_area - current_area)
print(
f"[INFO - AUC]: AUC diff: {diff * 100 / baseline_area} %, current: {current_area}, "
f"baseline: {baseline_area}"
)
assert (baseline_area <= 0) or (diff <= self.auc_threshold * baseline_area)
def _correlation_test_helper(self, loss_type):
expected = self.expected[loss_type]
expected_list = np.array(expected["values"])
actual_list = self._get_actual(loss_type)
actual_list_sliced = np.array(
actual_list[expected["start_step"] : expected["end_step"] : expected["step_interval"]]
)
corr = ss.pearsonr(actual_list_sliced, expected_list).statistic
print(f"[INFO - Corr]: Corr: {corr}")
assert corr > self.correlation_threshold
@pytest.mark.xfail
def test_lm_loss_margin(self):
self._setup()
self._margin_test_helper("lm loss")
def test_lm_loss_auc(self):
self._setup()
self._auc_test_helper("lm loss")
@pytest.mark.xfail
def test_lm_loss_correlation(self):
self._setup()
self._correlation_test_helper("lm loss")
def iteration_timing_node(self):
expected_iteration_timing_avg = self.expected["train_step_timing_avg"]
iteration_time = read_tb_logs_as_list(LOGS_DIR)["iteration-time"]
idx = len(iteration_time) // 3
iteration_time_avg = sum(iteration_time[idx:]) / len(iteration_time[idx:])
assert expected_iteration_timing_avg == pytest.approx(
expected=iteration_time_avg, rel=self.margin_time
), f"The time per global step must be approximately {expected_iteration_timing_avg} but it \
is {iteration_time_avg}."
import logging
from typing import Dict, List, Optional
import numpy as np
from tests.functional_tests.python_test_utils import common
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_regular_pipeline(
compare_approximate_results: bool,
golden_values: Dict[str, common.GoldenValueMetric],
tensorboard_logs: Dict[str, common.GoldenValueMetric],
checks: Optional[Dict[str, List[common.Test]]] = None,
):
if checks is None:
checks = {
"iteration-time": [common.ApproximateTest(atol=2.0, rtol=0)],
"mem-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"mem-max-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"lm loss": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0),
],
"num-zeros": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.20), rtol=0),
],
}
if (
len(
missing_metrics := [
golden_metric
for golden_metric in checks.keys()
if golden_metric not in golden_values.keys()
]
)
> 0
):
logger.error(
f"The following metrics are required but not provided in golden values: {', '.join(missing_metrics)}"
)
assert False
common.pipeline(
compare_approximate_results=compare_approximate_results,
golden_values=golden_values,
tensorboard_logs=tensorboard_logs,
checks=checks,
)
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import pytest
from tests.functional_tests.python_test_utils.common import TypeOfTest, read_tb_logs_as_list
LOGS_DIR = os.getenv("LOGS_DIR")
ALLOW_NONDETERMINISTIC = os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO")
STEP_INTERVAL = 5
def collect_train_test_metrics(logs_dir, index):
train_loss_list = read_tb_logs_as_list(logs_dir, index)["lm loss"]
train_loss_list = [round(elem, 3) for elem in train_loss_list]
train_metrics = {"lm loss": train_loss_list[0 : len(train_loss_list) : STEP_INTERVAL]}
str_train_metrics = str(train_metrics).replace("'", '"')
print("\n ----------- The following are the metrics for ----------")
print(f"\n {str_train_metrics}", flush=True)
return train_metrics
class TestCIPipeline:
margin_loss = 0.005
allow_nondeterministic = bool(int(ALLOW_NONDETERMINISTIC))
train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0)
train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1)
def _test_helper(self, loss_type, test_type):
expected = self.train_metrics_100[loss_type]
assert (
len(expected) == 100 // STEP_INTERVAL
), "Train metrics from first run (before checkpoint load) should \
have {100 // STEP_INTERVAL} elements"
print("expected : " + str(expected))
actual = self.train_metrics_50_to_100[loss_type]
assert (
len(actual) == 50 // STEP_INTERVAL
), "Train metrics from second run (after checkpoint load) should have \
{50 // STEP_INTERVAL} elements"
print("actual : " + str(actual))
start_idx_expected = len(expected) - len(actual)
print("start_idx_expected:", start_idx_expected)
# Here we will just be comparing values of actual and second half (50-100) of expected
for i, (expected_val, actual_val) in enumerate(zip(expected[start_idx_expected:], actual)):
step = start_idx_expected + i * STEP_INTERVAL
if test_type == TypeOfTest.APPROX:
assert actual_val == pytest.approx(
expected=expected_val, rel=self.margin_loss
), f"The loss at step {step} should be approximately {expected_val} but it is \
{actual_val}."
else:
assert (
actual_val == expected_val
), f"The value at step {step} should be {expected_val} but it is {actual_val}."
@pytest.mark.skipif(allow_nondeterministic, reason="Nondeterministic is allowed.")
def test_lm_loss_deterministic(self):
self._test_helper("lm loss", TypeOfTest.DETERMINISTIC)
@pytest.mark.skipif(not allow_nondeterministic, reason="Nondeterministic is not allowed.")
def test_lm_loss_nondeterministic(self):
self._test_helper("lm loss", TypeOfTest.APPROX)
import logging
from typing import Dict
import numpy as np
import yaml
from tests.functional_tests.python_test_utils import common, test_regular_pipeline
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_resume_checkpoint_pipeline(
compare_approximate_results: bool, tensorboard_path: str, train_iters: int
):
first_run_values = common.read_tb_logs_as_list(
tensorboard_path, index=0, train_iters=train_iters, start_idx=(train_iters // 2) + 1
)
second_run_values = common.read_tb_logs_as_list(
tensorboard_path, index=1, train_iters=train_iters, start_idx=(train_iters // 2) + 1
)
checks = {
"iteration-time": [common.ApproximateTest(atol=2.0, rtol=0)],
"mem-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"mem-max-allocated-bytes": [
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0)
],
"lm loss": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.05), rtol=0),
],
"num-zeros": [
common.DeterministicTest(),
common.ApproximateTest(atol_func=common.approximate_threshold(rtol=0.20), rtol=0),
],
}
if (
len(
missing_metrics := [
golden_metric
for golden_metric in checks.keys()
if golden_metric not in first_run_values.keys()
]
)
> 0
):
logger.error(
f"The following metrics are required but not logged during training: {', '.join(missing_metrics)}"
)
assert False
first_run_values = {
metric_name: metric_values
for (metric_name, metric_values) in first_run_values.items()
if metric_name in checks.keys()
}
second_run_values = {
metric_name: metric_values
for (metric_name, metric_values) in second_run_values.items()
if metric_name in checks.keys()
}
logger.info(first_run_values)
logger.info(second_run_values)
test_regular_pipeline.test_regular_pipeline(
compare_approximate_results=compare_approximate_results,
golden_values=first_run_values,
tensorboard_logs=second_run_values,
checks=checks,
)
......@@ -26,9 +26,11 @@ MANDATORY_VARS=(
"TRAINING_PARAMS_PATH"
"OUTPUT_PATH"
"TENSORBOARD_PATH"
"CHECKPOINT_PATH"
"CHECKPOINT_SAVE_PATH"
"CHECKPOINT_LOAD_PATH"
"DATA_PATH"
"RUN_NUMBER"
"REPEAT"
)
for mandatory_var in "${MANDATORY_VARS[@]}"; do
if [[ -z "${!mandatory_var}" ]]; then
......@@ -37,12 +39,9 @@ for mandatory_var in "${MANDATORY_VARS[@]}"; do
fi
done
cp $TRAINING_PARAMS_PATH "$TRAINING_PARAMS_PATH.${SLURM_PROCID}"
TRAINING_PARAMS_PATH="$TRAINING_PARAMS_PATH.${SLURM_PROCID}"
# Envsubst model_params
cat $TRAINING_PARAMS_PATH | envsubst "$(env | cut -d= -f1 | sed -e 's/^/$/')" >$TRAINING_PARAMS_PATH.tmp
mv $TRAINING_PARAMS_PATH.tmp "$TRAINING_PARAMS_PATH"
TRAINING_PARAMS_PATH="$TRAINING_PARAMS_PATH.tmp"
# Pull env vars to export
ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' "$TRAINING_PARAMS_PATH")
......@@ -54,13 +53,13 @@ while IFS= read -r ARGUMENT; do
export "$KEY"="$VALUE"
echo "$KEY=$VALUE"
done <<< "$ENV_VARS"
done <<<"$ENV_VARS"
# Run before script
SCRIPT=$(cat "$TRAINING_PARAMS_PATH" | yq '.BEFORE_SCRIPT')
if [[ "$SCRIPT" != null ]]; then
eval "$SCRIPT"
fi;
BEFORE_SCRIPT=$(cat "$TRAINING_PARAMS_PATH" | yq '.BEFORE_SCRIPT')
if [[ "$BEFORE_SCRIPT" != null ]]; then
eval "$BEFORE_SCRIPT"
fi
# Exit earlier to leave time for properly saving checkpoint
if [[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') == *nemo* ]]; then
......@@ -88,6 +87,33 @@ PARAMS="$PARAMS $TRAINING_PARAMS_FROM_CONFIG"
export PYTHONPATH="$(pwd):${PYTHONPATH:-}"
export WANDB_API_KEY="${WANDB_API_KEY:-}"
######## Distributed training settings. ########
echo "------ARGUMENTS for SLURM ---"
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-6000}
NUM_NODES=${NUM_NODES:-${SLURM_NNODES}}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
NODE_RANK=${SLURM_NODEID:-${SLURM_NODEID}}
LAST_RANK=7
export LOG_DIR=$OUTPUT_PATH/logs/$REPEAT
mkdir -p $LOG_DIR
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
--node_rank $SLURM_NODEID
--log-dir $LOG_DIR
--tee "0:3,7:3"
--redirects "3"
)
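# torchrun's --redirects 3 sends each local rank's stdout/stderr to files under $LOG_DIR,
# while --tee "0:3,7:3" additionally mirrors ranks 0 and 7 to the console.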
# Start training
python $TRAINING_SCRIPT_PATH $PARAMS
torchrun ${DISTRIBUTED_ARGS[@]} $TRAINING_SCRIPT_PATH $PARAMS || EXIT_CODE=$?
# Run after script
AFTER_SCRIPT=$(cat "$TRAINING_PARAMS_PATH" | yq '.AFTER_SCRIPT')
if [[ "$AFTER_SCRIPT" != null ]]; then
eval "$AFTER_SCRIPT"
fi
......@@ -21,7 +21,8 @@ MANDATORY_VARS=(
"GOLDEN_VALUES_PATH"
"OUTPUT_PATH"
"TENSORBOARD_PATH"
"CHECKPOINT_PATH"
"CHECKPOINT_SAVE_PATH"
"CHECKPOINT_LOAD_PATH"
"DATA_PATH"
"DATA_CACHE_PATH"
)
......@@ -32,42 +33,97 @@ for mandatory_var in "${MANDATORY_VARS[@]}"; do
fi
done
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
RECORD_CHECKPOINTS=${RECORD_CHECKPOINTS:-"false"}
TEST_TYPES=("regular" "ckpt-resume" "frozen-resume" "frozen-start" "release")
mkdir -p $CHECKPOINT_SAVE_PATH
mkdir -p $CHECKPOINT_LOAD_PATH
_CHECKPOINT_LOAD_PATH=$CHECKPOINT_LOAD_PATH
_CHECKPOINT_SAVE_PATH=$CHECKPOINT_SAVE_PATH
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
ROOT_DIR=$(realpath $SCRIPT_DIR/../../../)
# Extract settings from params file
TEST_TYPE=$(cat $TRAINING_PARAMS_PATH \
| yq '.TEST_TYPE')
NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH \
| yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO')
SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH \
| yq '.ENV_VARS.SKIP_PYTEST')
for i in $(seq 1 $N_REPEAT);
do
TEST_TYPE=$(cat $TRAINING_PARAMS_PATH |
yq '.TEST_TYPE')
NVTE_ALLOW_NONDETERMINISTIC_ALGO=$(cat $TRAINING_PARAMS_PATH |
yq '.ENV_VARS.NVTE_ALLOW_NONDETERMINISTIC_ALGO')
SKIP_PYTEST=$(cat $TRAINING_PARAMS_PATH |
yq '.ENV_VARS.SKIP_PYTEST')
TRAIN_ITERS=$(cat $TRAINING_PARAMS_PATH |
yq '.MODEL_ARGS."--train-iters" // "100"')
for i in $(seq 1 $N_REPEAT); do
if [[ $i -gt 1 ]]; then
rm -rf $CHECKPOINT_PATH/*
rm -rf $CHECKPOINT_SAVE_PATH/*
rm -rf /tmp/checkpoints/*
rm -rf $TENSORBOARD_PATH/*
fi
# Training
# First run never loads from a checkpoint
export RUN_NUMBER=1
export REPEAT=$i
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
if [[ "$TEST_TYPE" = "frozen-start" ]]; then
export CHECKPOINT_LOAD_PATH=$_CHECKPOINT_LOAD_PATH
else
export CHECKPOINT_LOAD_PATH=/tmp/checkpoints/
fi
if [[ "$TEST_TYPE" = "release" ]]; then
export CHECKPOINT_LOAD_PATH=$_CHECKPOINT_LOAD_PATH
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
fi
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
# Maybe checkpoint resume training
if [[ "$TEST_TYPE" = "frozen-resume" && -z "$(ls -A "$_CHECKPOINT_LOAD_PATH" 2>/dev/null)" ]]; then
echo "No frozen checkpoint found. Will skip second run."
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_SAVE_PATH/latest_checkpointed_iteration.txt
break
fi
if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
if [[ ${SLURM_PROCID} -eq 0 ]]; then
rm -rf $CHECKPOINT_PATH/iter_0000100;
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt;
export CHECKPOINT_LOAD_PATH=$CHECKPOINT_SAVE_PATH
rm -rf "$CHECKPOINT_LOAD_PATH/iter_0000$TRAIN_ITERS"
echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_LOAD_PATH/latest_checkpointed_iteration.txt
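# Drop the final checkpoint and point 'latest' at the halfway iteration so run 2 resumes from train_iters/2.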
export RUN_NUMBER=2
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
fi
if [[ "$TEST_TYPE" == "frozen-resume" ]]; then
# Checkpoint-resume tests load from prev run
export CHECKPOINT_LOAD_PATH=$_CHECKPOINT_LOAD_PATH
export CHECKPOINT_SAVE_PATH=/tmp/checkpoints/
export RUN_NUMBER=2
bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_SAVE_PATH/latest_checkpointed_iteration.txt
fi
if [[ ${SLURM_PROCID} -gt 0 ]]; then
continue
if [[ "$TEST_TYPE" == "release" ]]; then
SKIP_PYTEST=0
fi
if [[ ${RECORD_CHECKPOINTS} == "true" ]]; then
echo "Skipping Pytest during checkpoint recording."
SKIP_PYTEST=1
fi
# Maybe run tests
if [[ ${SKIP_PYTEST:-0} != 1 ]]; then
# Save run results
export PYTHONPATH=$ROOT_DIR
if [[ "$TEST_TYPE" == "release" ]]; then
......@@ -75,29 +131,37 @@ do
else
EXTRACT_ARGS=("--is-normal-test")
fi
python3 $ROOT_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py \
--logs-dir $TENSORBOARD_PATH \
--train-iters $TRAIN_ITERS \
--output-path ${OUTPUT_PATH}/$(basename $GOLDEN_VALUES_PATH) \
"${EXTRACT_ARGS[@]}"
# Maybe run tests
if [[ ${SKIP_PYTEST:-0} != 1 ]]; then
export NVTE_ALLOW_NONDETERMINISTIC_ALGO
export LOGS_DIR=$TENSORBOARD_PATH
if [[ "$TEST_TYPE" == "ckpt-resume" ]]; then
echo "Running pytest 1st vs 2nd run comparison"
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
if [[ "${NVTE_ALLOW_NONDETERMINISTIC_ALGO}" == "1" ]]; then
ALLOW_NONDETERMINISTIC_ALGO_ARG="--allow-nondeterministic-algo"
fi
elif [[ "$TEST_TYPE" == "regular" ]]; then
echo "Running pytest checks against golden values"
export EXPECTED_METRICS_FILE=$GOLDEN_VALUES_PATH
pytest -s $ROOT_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py
else
pytest -s -o log_cli=true --log-cli-level=info $ROOT_DIR/tests/functional_tests/python_test_utils/test_regular_pipeline.py \
--golden-values-path $GOLDEN_VALUES_PATH \
--tensorboard-path $TENSORBOARD_PATH \
--model-config-path ${TRAINING_PARAMS_PATH} \
$ALLOW_NONDETERMINISTIC_ALGO_ARG
if [[ "$TEST_TYPE" == "ckpt-resume" || "$TEST_TYPE" == "frozen-resume" ]]; then
echo "Running pytest 1st vs 2nd run comparison"
pytest -s -o log_cli=true --log-cli-level=info $ROOT_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py \
--tensorboard-path $TENSORBOARD_PATH \
--train-iters $TRAIN_ITERS \
--model-config-path ${TRAINING_PARAMS_PATH} \
$ALLOW_NONDETERMINISTIC_ALGO_ARG
fi
if [[ ! " ${TEST_TYPES[*]} " =~ " ${TEST_TYPE} " ]]; then
echo "Test type $TEST_TYPE not yet implemented."
fi
fi
done
{ "lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49569,
10.48173,
10.48047,
10.45353,
10.44394,
10.35611,
10.13779,
10.04017,
9.86834,
9.67307
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2254.0,
2585.0,
2101.0,
2157.0,
2241.0,
2475.0,
2890.0,
3199.0,
3524.0,
3090.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
13.65829,
1.27589,
1.2782,
1.32374,
1.26543,
1.26423,
1.26203,
1.54723,
1.27297,
1.26491
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48424, "10": 10.49936, "15": 10.46628, "20": 10.44794, "25": 10.34964, "30": 10.17263, "35": 10.04261, "40": 9.90783, "45": 9.75774, "50": 9.67693}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2182.0, "5": 2584.0, "10": 2205.0, "15": 2539.0, "20": 2089.0, "25": 2604.0, "30": 2913.0, "35": 2967.0, "40": 2378.0, "45": 3923.0, "50": 3599.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 11.04517, "5": 1.25649, "10": 1.25549, "15": 1.2581, "20": 1.26387, "25": 1.25714, "30": 1.25866, "35": 1.26592, "40": 1.24291, "45": 1.23727, "50": 1.24404}}}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.49574, 10.48174, 10.4804, 10.45344, 10.44396, 10.35607, 10.13786, 10.04016, 9.86838, 9.67302]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [2182.0, 2462.0, 2158.0, 2112.0, 2291.0, 2485.0, 2953.0, 3287.0, 3440.0, 3059.0]}, "iteration_timing_avg": 0.8110379411764704}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34964, "30": 10.1728, "35": 10.04262, "40": 9.90767, "45": 9.75792, "50": 9.67684}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2151.0, "25": 2601.0, "30": 2801.0, "35": 3107.0, "40": 2294.0, "45": 3909.0, "50": 3482.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2300849152.0, "5": 3043311616.0, "10": 3043311616.0, "15": 3043311616.0, "20": 3043311616.0, "25": 3043311616.0, "30": 3043311616.0, "35": 3043311616.0, "40": 3043311616.0, "45": 3043311616.0, "50": 3043311616.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 14.67278, "5": 1.17709, "10": 1.18485, "15": 1.20778, "20": 1.16573, "25": 1.17871, "30": 1.16949, "35": 1.16897, "40": 1.16996, "45": 1.16571, "50": 1.17045}}}
\ No newline at end of file
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.49569, "5": 10.48402, "10": 10.49933, "15": 10.46635, "20": 10.44782, "25": 10.34968, "30": 10.17276, "35": 10.04265, "40": 9.90757, "45": 9.75784, "50": 9.67694, "55": 9.55383, "60": 9.45452, "65": 9.42152, "70": 9.30114, "75": 9.3222, "80": 9.26181, "85": 9.2967, "90": 9.23351, "95": 9.23792, "100": 9.10613}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2254.0, "5": 2635.0, "10": 2165.0, "15": 2534.0, "20": 2227.0, "25": 2559.0, "30": 2905.0, "35": 3026.0, "40": 2314.0, "45": 3924.0, "50": 3557.0, "55": 3573.0, "60": 2689.0, "65": 3434.0, "70": 3935.0, "75": 5047.0, "80": 3601.0, "85": 4133.0, "90": 4603.0, "95": 4291.0, "100": 3165.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0, "55": 1754654208.0, "60": 1754654208.0, "65": 1754654208.0, "70": 1754654208.0, "75": 1754654208.0, "80": 1754654208.0, "85": 1754654208.0, "90": 1754654208.0, "95": 1754654208.0, "100": 1754654208.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2503224320.0, "5": 3245686784.0, "10": 3245686784.0, "15": 3245686784.0, "20": 3245686784.0, "25": 3245686784.0, "30": 3245686784.0, "35": 3245686784.0, "40": 3245686784.0, "45": 3245686784.0, "50": 3245686784.0, "55": 3245686784.0, "60": 3245686784.0, "65": 3245686784.0, "70": 3245686784.0, "75": 3245686784.0, "80": 3245686784.0, "85": 3245686784.0, "90": 3245686784.0, "95": 3245686784.0, "100": 3245686784.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 14.19715, "5": 1.20808, "10": 1.19543, "15": 1.19563, "20": 1.42719, "25": 1.40901, "30": 1.18769, "35": 1.43684, "40": 1.18523, "45": 1.18204, "50": 1.18891, "55": 1.20368, "60": 1.19171, "65": 1.18981, "70": 1.17772, "75": 1.18903, "80": 1.17548, "85": 1.1753, "90": 1.36634, "95": 1.17827, "100": 1.17843}}}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 10.49574, "5": 10.48398, "10": 10.49943, "15": 10.4663, "20": 10.44775, "25": 10.34964, "30": 10.1728, "35": 10.04262, "40": 9.90767, "45": 9.75792, "50": 9.67684, "55": 9.55378, "60": 9.45458, "65": 9.42133, "70": 9.30109, "75": 9.32203, "80": 9.26184, "85": 9.29667, "90": 9.23332, "95": 9.23793, "100": 9.10611}}, "num-zeros": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 2182.0, "5": 2568.0, "10": 2108.0, "15": 2533.0, "20": 2151.0, "25": 2601.0, "30": 2801.0, "35": 3107.0, "40": 2294.0, "45": 3909.0, "50": 3482.0, "55": 3606.0, "60": 2653.0, "65": 3341.0, "70": 3849.0, "75": 5090.0, "80": 3613.0, "85": 4194.0, "90": 4618.0, "95": 4439.0, "100": 3224.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 1754654208.0, "5": 1754654208.0, "10": 1754654208.0, "15": 1754654208.0, "20": 1754654208.0, "25": 1754654208.0, "30": 1754654208.0, "35": 1754654208.0, "40": 1754654208.0, "45": 1754654208.0, "50": 1754654208.0, "55": 1754654208.0, "60": 1754654208.0, "65": 1754654208.0, "70": 1754654208.0, "75": 1754654208.0, "80": 1754654208.0, "85": 1754654208.0, "90": 1754654208.0, "95": 1754654208.0, "100": 1754654208.0}}, "iteration-time": {"start_step": 1, "end_step": 100, "step_interval": 5, "values": {"1": 12.95742, "5": 1.16734, "10": 2.45473, "15": 1.45839, "20": 1.51474, "25": 1.15989, "30": 1.14801, "35": 1.14584, "40": 1.15517, "45": 1.14468, "50": 1.14969, "55": 1.15684, "60": 1.14892, "65": 1.14737, "70": 1.30233, "75": 1.37176, "80": 1.1466, "85": 1.24468, "90": 1.15157, "95": 1.15026, "100": 1.15254}}}
\ No newline at end of file
ENV_VARS:
CUDA_DEVICE_MAX_CONNECTIONS: 1
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
--num-attention-heads: 16
--log-params-norm: true
--log-num-zeros-in-grad: true
--log-validation-ppl-to-tensorboard: true
--log-timers-to-tensorboard: true
--log-memory-to-tensorboard: true
--tensorboard-dir: ${TENSORBOARD_PATH}
--micro-batch-size: 4
--global-batch-size: 128
--seq-length: 512
--max-position-embeddings: 512
--train-iters: 100
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.0001
--min-lr: 0.00001
--lr-warmup-fraction: 0.01
--log-interval: 1
--save-interval: 50
--eval-interval: 1000
--eval-iters: 10
--tensor-model-parallel-size: 2
--pipeline-model-parallel-size: 2
--deterministic-mode: true
--use-checkpoint-args: true
--use-checkpoint-opt_param-scheduler: true
--no-gradient-accumulation-fusion: true
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--ckpt-format: torch
--attention-backend: unfused
TEST_TYPE: frozen-resume
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49566,
10.48172,
10.48046,
10.45369,
10.44391,
10.35613,
10.13791,
10.04025,
9.86848,
9.67328
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2183.0,
2571.0,
2097.0,
2118.0,
2414.0,
2464.0,
2988.0,
3223.0,
3481.0,
3046.0
]
},
"mem-allocated-bytes": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1767237120.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0,
1767237632.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
13.74859,
1.16037,
1.15664,
1.28303,
1.16087,
1.1576,
1.15188,
1.1644,
1.15171,
1.38366
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49566, "5": 10.48412, "10": 10.49946, "15": 10.46625, "20": 10.44783, "25": 10.34967, "30": 10.17283, "35": 10.04281, "40": 9.90782, "45": 9.75786, "50": 9.67692}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2183.0, "5": 2683.0, "10": 2206.0, "15": 2493.0, "20": 2165.0, "25": 2528.0, "30": 2774.0, "35": 3054.0, "40": 2250.0, "45": 3947.0, "50": 3608.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1767237120.0, "5": 1767237120.0, "10": 1767237120.0, "15": 1767237120.0, "20": 1767237120.0, "25": 1768285696.0, "30": 1767237120.0, "35": 1767237120.0, "40": 1767237120.0, "45": 1767237120.0, "50": 1767237120.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.83402, "5": 1.12649, "10": 1.1312, "15": 1.12374, "20": 1.12209, "25": 1.13995, "30": 1.38104, "35": 1.14649, "40": 1.14975, "45": 1.14816, "50": 1.15079}}}
\ No newline at end of file
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.49566,
10.48166,
10.48045,
10.45348,
10.44412,
10.3561,
10.13792,
10.04026,
9.86832,
9.67306
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
2183.0,
2469.0,
2115.0,
2126.0,
2281.0,
2389.0,
3013.0,
3255.0,
3491.0,
3062.0
]
},
"mem-allocated-bytes": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0,
1767237120.0
]
},
"iteration-time": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
14.75035,
1.17988,
1.18643,
1.18301,
1.19116,
1.19494,
1.54654,
1.19342,
1.1823,
1.18039
]
}
}
\ No newline at end of file
{"lm loss": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 10.49566, "5": 10.48418, "10": 10.49947, "15": 10.46646, "20": 10.44777, "25": 10.34987, "30": 10.17278, "35": 10.04282, "40": 9.90771, "45": 9.75789, "50": 9.67683}}, "num-zeros": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2183.0, "5": 2533.0, "10": 2162.0, "15": 2548.0, "20": 2180.0, "25": 2557.0, "30": 2908.0, "35": 2999.0, "40": 2252.0, "45": 3808.0, "50": 3622.0}}, "mem-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 1767237120.0, "5": 1767237120.0, "10": 1767237120.0, "15": 1767237120.0, "20": 1767237120.0, "25": 1767237120.0, "30": 1767237120.0, "35": 1767237120.0, "40": 1767237120.0, "45": 1767237120.0, "50": 1767237120.0}}, "mem-max-allocated-bytes": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 2313432064.0, "5": 3055894528.0, "10": 3055894528.0, "15": 3055894528.0, "20": 3055894528.0, "25": 3055894528.0, "30": 3055894528.0, "35": 3055894528.0, "40": 3055894528.0, "45": 3055894528.0, "50": 3055894528.0}}, "iteration-time": {"start_step": 1, "end_step": 50, "step_interval": 5, "values": {"1": 12.23164, "5": 1.1819, "10": 1.18193, "15": 1.18147, "20": 1.18394, "25": 1.37105, "30": 1.18551, "35": 1.18659, "40": 1.18004, "45": 1.183, "50": 1.196}}}
\ No newline at end of file
......@@ -20,8 +20,8 @@ MODEL_ARGS:
--train-iters: 50
--timing-log-level: 2
--lr-decay-iters: 990000
--save: ${CHECKPOINT_PATH}
--load: ${CHECKPOINT_PATH}
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/my-bert_00_text_sentence
--vocab-file: ${DATA_PATH}/vocab.txt
--split: 949,50,1
......