"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b90fbc7e0ba41dfd6b343e7e2274443f19087f36"
Unverified commit 6f90c29e, authored by Philipp Schmid and committed by GitHub
Browse files

added json dump and extraction of train run time (#11167)

* added json dump and extraction of train run time

* make style happy
parent 07f0bb69
import json
import os import os
import subprocess import subprocess
import unittest import unittest
...@@ -11,7 +12,7 @@ from . import is_sagemaker_available ...@@ -11,7 +12,7 @@ from . import is_sagemaker_available
if is_sagemaker_available(): if is_sagemaker_available():
from sagemaker import TrainingJobAnalytics from sagemaker import Session, TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace from sagemaker.huggingface import HuggingFace
...@@ -27,21 +28,21 @@ if is_sagemaker_available(): ...@@ -27,21 +28,21 @@ if is_sagemaker_available():
"script": "run_glue.py", "script": "run_glue.py",
"model_name_or_path": "distilbert-base-cased", "model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge", "instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6}, "results": {"train_runtime": 650, "eval_accuracy": 0.7, "eval_loss": 0.6},
}, },
{ {
"framework": "pytorch", "framework": "pytorch",
"script": "run_ddp.py", "script": "run_ddp.py",
"model_name_or_path": "distilbert-base-cased", "model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge", "instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6}, "results": {"train_runtime": 600, "eval_accuracy": 0.7, "eval_loss": 0.6},
}, },
{ {
"framework": "tensorflow", "framework": "tensorflow",
"script": "run_tf_dist.py", "script": "run_tf_dist.py",
"model_name_or_path": "distilbert-base-cased", "model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.p3dn.24xlarge", "instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 500, "eval_accuracy": 0.6, "eval_loss": 0.7}, "results": {"train_runtime": 600, "eval_accuracy": 0.6, "eval_loss": 0.7},
}, },
] ]
) )
...@@ -88,17 +89,22 @@ class MultiNodeTest(unittest.TestCase): ...@@ -88,17 +89,22 @@ class MultiNodeTest(unittest.TestCase):
# run training # run training
estimator.fit() estimator.fit()
# save csv
self.save_results_as_csv(estimator.latest_training_job.name)
# result dataframe # result dataframe
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
# extract kpis # extract kpis
train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
# get train time from SageMaker job, this includes starting, preprocessing, stopping
train_runtime = (
Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
)
# assert kpis # assert kpis
assert all(t <= self.results["train_runtime"] for t in train_runtime) assert train_runtime <= self.results["train_runtime"]
assert any(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
assert all(t <= self.results["eval_loss"] for t in eval_loss) assert all(t <= self.results["eval_loss"] for t in eval_loss)
# dump tests result into json file to share in PR
with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
import json
import os import os
import subprocess import subprocess
import unittest import unittest
...@@ -11,7 +12,7 @@ from . import is_sagemaker_available ...@@ -11,7 +12,7 @@ from . import is_sagemaker_available
if is_sagemaker_available(): if is_sagemaker_available():
from sagemaker import TrainingJobAnalytics from sagemaker import Session, TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace from sagemaker.huggingface import HuggingFace
...@@ -27,14 +28,14 @@ if is_sagemaker_available(): ...@@ -27,14 +28,14 @@ if is_sagemaker_available():
"script": "run_glue_model_parallelism.py", "script": "run_glue_model_parallelism.py",
"model_name_or_path": "roberta-large", "model_name_or_path": "roberta-large",
"instance_type": "ml.p3dn.24xlarge", "instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 700, "eval_accuracy": 0.3, "eval_loss": 1.2}, "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2},
}, },
{ {
"framework": "pytorch", "framework": "pytorch",
"script": "run_glue.py", "script": "run_glue.py",
"model_name_or_path": "roberta-large", "model_name_or_path": "roberta-large",
"instance_type": "ml.p3dn.24xlarge", "instance_type": "ml.p3dn.24xlarge",
"results": {"train_runtime": 700, "eval_accuracy": 0.3, "eval_loss": 1.2}, "results": {"train_runtime": 1500, "eval_accuracy": 0.3, "eval_loss": 1.2},
}, },
] ]
) )
...@@ -69,13 +70,14 @@ class MultiNodeTest(unittest.TestCase): ...@@ -69,13 +70,14 @@ class MultiNodeTest(unittest.TestCase):
distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options} distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options}
name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer"
# creates estimator # creates estimator
return HuggingFace( return HuggingFace(
entry_point=self.script, entry_point=self.script,
source_dir=self.env.test_path, source_dir=self.env.test_path,
role=self.env.role, role=self.env.role,
image_uri=self.env.image_uri, image_uri=self.env.image_uri,
base_job_name=f"{self.env.base_job_name}-{instance_count}-smp", base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}",
instance_count=instance_count, instance_count=instance_count,
instance_type=self.instance_type, instance_type=self.instance_type,
debugger_hook_config=False, debugger_hook_config=False,
...@@ -101,17 +103,22 @@ class MultiNodeTest(unittest.TestCase): ...@@ -101,17 +103,22 @@ class MultiNodeTest(unittest.TestCase):
# run training # run training
estimator.fit() estimator.fit()
# save csv
self.save_results_as_csv(estimator.latest_training_job.name)
# result dataframe # result dataframe
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
# extract kpis # extract kpis
train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
# get train time from SageMaker job, this includes starting, preprocessing, stopping
train_runtime = (
Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
)
# assert kpis # assert kpis
assert all(t <= self.results["train_runtime"] for t in train_runtime) assert train_runtime <= self.results["train_runtime"]
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
assert all(t <= self.results["eval_loss"] for t in eval_loss) assert all(t <= self.results["eval_loss"] for t in eval_loss)
# dump tests result into json file to share in PR
with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
import json
import os import os
import subprocess import subprocess
import unittest import unittest
...@@ -11,7 +12,7 @@ from . import is_sagemaker_available ...@@ -11,7 +12,7 @@ from . import is_sagemaker_available
if is_sagemaker_available(): if is_sagemaker_available():
from sagemaker import TrainingJobAnalytics from sagemaker import Session, TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace from sagemaker.huggingface import HuggingFace
...@@ -27,14 +28,14 @@ if is_sagemaker_available(): ...@@ -27,14 +28,14 @@ if is_sagemaker_available():
"script": "run_glue.py", "script": "run_glue.py",
"model_name_or_path": "distilbert-base-cased", "model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.g4dn.xlarge", "instance_type": "ml.g4dn.xlarge",
"results": {"train_runtime": 200, "eval_accuracy": 0.6, "eval_loss": 0.9}, "results": {"train_runtime": 650, "eval_accuracy": 0.6, "eval_loss": 0.9},
}, },
{ {
"framework": "tensorflow", "framework": "tensorflow",
"script": "run_tf.py", "script": "run_tf.py",
"model_name_or_path": "distilbert-base-cased", "model_name_or_path": "distilbert-base-cased",
"instance_type": "ml.g4dn.xlarge", "instance_type": "ml.g4dn.xlarge",
"results": {"train_runtime": 350, "eval_accuracy": 0.3, "eval_loss": 0.9}, "results": {"train_runtime": 600, "eval_accuracy": 0.3, "eval_loss": 0.9},
}, },
] ]
) )
...@@ -74,17 +75,22 @@ class SingleNodeTest(unittest.TestCase): ...@@ -74,17 +75,22 @@ class SingleNodeTest(unittest.TestCase):
# run training # run training
estimator.fit() estimator.fit()
# save csv
self.save_results_as_csv(estimator.latest_training_job.name)
# result dataframe # result dataframe
result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe() result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()
# extract kpis # extract kpis
train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"]) eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"]) eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
# get train time from SageMaker job, this includes starting, preprocessing, stopping
train_runtime = (
Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
)
# assert kpis # assert kpis
assert all(t <= self.results["train_runtime"] for t in train_runtime) assert train_runtime <= self.results["train_runtime"]
assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy) assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
assert all(t <= self.results["eval_loss"] for t in eval_loss) assert all(t <= self.results["eval_loss"] for t in eval_loss)
# dump tests result into json file to share in PR
with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment