Unverified Commit 604c0850 authored by Philipp Schmid's avatar Philipp Schmid Committed by GitHub
Browse files

Sagemaker test (#10925)

* init

* first working test

* added todo for setup.py

* working test for single node multi node ddp and smd

* added tensorflow single node test

* added directory for pytorch and tensorflow due to different requirements.txt

* added directory for pytorch and tensorflow

* added comment for run_glue until it is available

* added output_dir to it

* smaller dataset to make test running faster

* adjust HP and script

* adjusted parameter for tensorflow

* refactored test scripts

* adjusted make file

* init

* first working test

* added todo for setup.py

* working test for single node multi node ddp and smd

* added tensorflow single node test

* added directory for pytorch and tensorflow due to different requirements.txt

* added directory for pytorch and tensorflow

* added comment for run_glue until it is available

* added output_dir to it

* smaller dataset to make test running faster

* adjust HP and script

* adjusted parameter for tensorflow

* refactored test scripts

* adjusted make file

* updated dlc container

* commented in all tests

* added both ecr images

* added new master branches

* debug

* added new datasets version

* init

* strange rebase bug

* removed changes

* changed min version for tests to work

* updated DLC

* added model parallel test

* removed test files

* removed test files

* tested with ned dlc

* added correct sagemaker sdk version

* adjust DLCs for official one

* reworked tests

* quality

* removed default profile added documentation to it

* added step in release for sagemaker tests

* reverted version for example script removed duplicated script and added install from master to requirements.txt

* removed mistaken .DS_Stores from mac

* fixed tests

* added Sylvains feedback

* make style

* added lysandre's feedback
parent 6dfd0272
......@@ -74,6 +74,12 @@ test:
test-examples:
python -m pytest -n auto --dist=loadfile -s -v ./examples/
# Run tests for SageMaker DLC release
test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker]
TEST_SAGEMAKER=True python -m pytest -n auto -s -v ./tests/sagemaker
# Check that docs can build
docs:
......
......@@ -19,15 +19,17 @@ To create the package for pypi.
1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
documentation.
2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid.
2. Unpin specific versions from setup.py that use a git install.
3. Unpin specific versions from setup.py that use a git install.
3. Commit these changes with the message: "Release: VERSION"
4. Commit these changes with the message: "Release: VERSION"
4. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' "
5. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' "
Push the tag to git: git push --tags origin master
5. Build both the sources and the wheel. Do not change anything in setup.py between
6. Build both the sources and the wheel. Do not change anything in setup.py between
creating the wheel and the source distribution (obviously).
For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
......@@ -36,7 +38,7 @@ To create the package for pypi.
For the sources, run: "python setup.py sdist"
You should now have a /dist directory with both .whl and .tar.gz source versions.
6. Check that everything looks correct by uploading the package to the pypi test server:
7. Check that everything looks correct by uploading the package to the pypi test server:
twine upload dist/* -r pypitest
(pypi suggest using twine as other methods upload files via plaintext.)
......@@ -46,12 +48,12 @@ To create the package for pypi.
Check that you can install it in a virtualenv by running:
pip install -i https://testpypi.python.org/pypi transformers
7. Upload the final version to actual pypi:
8. Upload the final version to actual pypi:
twine upload dist/* -r pypi
8. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
9. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
9. Run `make post-release` (or `make post-patch` for a patch release).
10. Run `make post-release` (or `make post-patch` for a patch release).
"""
import os
......@@ -134,6 +136,7 @@ _deps = [
"unidic>=1.0.2",
"unidic_lite>=1.0.7",
"uvicorn",
"sagemaker>=2.31.0",
]
......@@ -223,12 +226,16 @@ extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxruntime"]
extras["modelcreation"] = deps_list("cookiecutter")
extras["sagemaker"] = deps_list("sagemaker")
extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
extras["speech"] = deps_list("soundfile", "torchaudio")
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
extras["testing"] = (
deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black")
deps_list(
"pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black"
)
+ extras["retrieval"]
+ extras["modelcreation"]
)
......
......@@ -53,4 +53,5 @@ deps = {
"unidic": "unidic>=1.0.2",
"unidic_lite": "unidic_lite>=1.0.7",
"uvicorn": "uvicorn",
"sagemaker": "sagemaker>=2.31.0",
}
# Testing new Hugging Face Deep Learning Container.
This document explains the testing strategy for releasing the new Hugging Face Deep Learning Container. AWS maintains 14 days of currency with framework releases. Besides framework releases, AWS release train is bi-weekly on Monday. Code cutoff date for any changes is the Wednesday before release-Monday.
## Test Case 1: Releasing a New Version (Minor/Major) of 🤗 Transformers
### Requirements: Test should run on Release Candidate for new `transformers` release to validate the new release is compatible with the DLCs. To run these tests you need credentials for the HF SageMaker AWS Account. You can ask @philschmid or @n1t0 to get access.
### Run Tests:
Before we can run the tests we need to adjust the `requirements.txt` for PyTorch under `/tests/sagemaker/scripts/pytorch` and for TensorFlow under `/tests/sagemaker/scripts/tensorflow`. We adjust the branch to the new RC tag.
```
git+https://github.com/huggingface/transformers.git@v4.5.0.rc0 # install master, or replace it with vX.X.X to install a specific transformers version
```
After we adjusted the `requirements.txt` we can run Amazon SageMaker tests with:
```bash
AWS_PROFILE=<enter-your-profile> make test-sagemaker
```
These tests take around 10-15 minutes to finish. Preferably make a screenshot of the successfully ran tests.
### After Transformers Release:
After we have released the Release Candidate we need to create a PR at the [Deep Learning Container Repository](https://github.com/aws/deep-learning-containers).
**Creating the update PR:**
1. Update the two latest `buildspec.yaml` config for [PyTorch](https://github.com/aws/deep-learning-containers/tree/master/huggingface/pytorch) and [TensorFlow](https://github.com/aws/deep-learning-containers/tree/master/huggingface/tensorflow). The two latest `buildspec.yaml` are the `buildspec.yaml` without a version tag and the one with the highest framework version, e.g. `buildspec-1-7-1.yml` and not `buildspec-1-6.yml`.
To update the `buildspec.yaml` we need to adjust either the `transformers_version` or the `datasets_version` or both. Example for upgrading to `transformers 4.5.0` and `datasets 1.6.0`.
```yaml
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK pytorch
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION 1.6.0
short_version: &SHORT_VERSION 1.6
repository_info:
training_repository: &TRAINING_REPOSITORY
image_type: &TRAINING_IMAGE_TYPE training
root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
*REPOSITORY_NAME ]
images:
BuildHuggingFacePytorchGpuPy37Cu110TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
image_size_baseline: &IMAGE_SIZE_BASELINE 15000
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py36
cuda_version: &CUDA_VERSION cu110
os_version: &OS_VERSION ubuntu18.04
transformers_version: &TRANSFORMERS_VERSION 4.5.0 # this was adjusted from 4.4.2 to 4.5.0
datasets_version: &DATASETS_VERSION 1.6.0 # this was adjusted from 1.5.0 to 1.6.0
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
*CUDA_VERSION, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
*CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
```
2. In the PR comment, describe which tests we ran and with which package versions. Here you can copy the table from [Current Tests](#current-tests).
TODO: Add a screenshot of PR + Text template to make it easy to open.
## Test Case 2: Releasing a New AWS Framework DLC
## Execute Tests
### Requirements:
AWS is going to release new DLCs for PyTorch and/or TensorFlow. The Tests should run on the new framework versions with current `transformers` release to validate the new framework release is compatible with the `transformers` version. To run these tests you need credentials for the HF SageMaker AWS Account. You can ask @philschmid or @n1t0 to get access. AWS will notify us with a new issue in the repository pointing to their framework upgrade PR.
### Run Tests:
Before we can run the tests we need to adjust the `requirements.txt` for PyTorch under `/tests/sagemaker/scripts/pytorch` and for TensorFlow under `/tests/sagemaker/scripts/tensorflow`. We add the new framework version to it.
```
torch==1.8.1 # for pytorch
tensorflow-gpu==2.5.0 # for tensorflow
```
After we adjusted the `requirements.txt` we can run Amazon SageMaker tests with:
```bash
AWS_PROFILE=<enter-your-profile> make test-sagemaker
```
These tests take around 10-15 minutes to finish. Preferably make a screenshot of the successfully ran tests.
### After successful Tests:
After we have successfully run tests for the new framework version we need to create a PR at the [Deep Learning Container Repository](https://github.com/aws/deep-learning-containers).
**Creating the update PR:**
1. Create a new `buildspec.yaml` config for [PyTorch](https://github.com/aws/deep-learning-containers/tree/master/huggingface/pytorch) and [TensorFlow](https://github.com/aws/deep-learning-containers/tree/master/huggingface/tensorflow) and rename the old `buildspec.yaml` to `buildspec-x-x.yml`, where `x-x` is the base framework version, e.g. if pytorch 1.6.0 is the latest version in `buildspec.yaml` the file should be renamed to `buildspec-1-6.yml`.
To create the new `buildspec.yaml` we need to adjust the `version` and the `short_version`. Example for upgrading to `pytorch 1.7.1`.
```yaml
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK pytorch
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
version: &VERSION 1.7.1 # this was adjusted from 1.6.0 to 1.7.1
short_version: &SHORT_VERSION 1.7 # this was adjusted from 1.6 to 1.7
repository_info:
training_repository: &TRAINING_REPOSITORY
image_type: &TRAINING_IMAGE_TYPE training
root: !join [ "huggingface/", *BASE_FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
repository_name: &REPOSITORY_NAME !join ["pr", "-", "huggingface", "-", *BASE_FRAMEWORK, "-", *TRAINING_IMAGE_TYPE]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
*REPOSITORY_NAME ]
images:
BuildHuggingFacePytorchGpuPy37Cu110TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &HUGGINGFACE_PYTORCH_GPU_TRAINING_PY3 false
image_size_baseline: &IMAGE_SIZE_BASELINE 15000
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py36
cuda_version: &CUDA_VERSION cu110
os_version: &OS_VERSION ubuntu18.04
transformers_version: &TRANSFORMERS_VERSION 4.4.2
datasets_version: &DATASETS_VERSION 1.5.0
tag: !join [ *VERSION, '-', 'transformers', *TRANSFORMERS_VERSION, '-', *DEVICE_TYPE, '-', *TAG_PYTHON_VERSION, '-',
*CUDA_VERSION, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
*CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
```
2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests).
TODO: Add a screenshot of PR + Text template to make it easy to open.
## Current Tests
| ID | Description | Platform | #GPUS | Collected & evaluated metrics |
|-------------------------------------|-------------------------------------------------------------------|-----------------------------|-------|------------------------------------------|
| pytorch-transfromers-test-single | test bert finetuning using BERT from transformer lib+PT | SageMaker createTrainingJob | 1 | train_runtime, eval_accuracy & eval_loss |
| pytorch-transfromers-test-2-ddp | test bert finetuning using BERT from transformer lib+ PT DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss |
| pytorch-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ PT SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss |
| pytorch-transfromers-test-1-smp | test roberta finetuning using RoBERTa from transformer lib+ PT SM MP | SageMaker createTrainingJob | 8 | train_runtime, eval_accuracy & eval_loss |
| tensorflow-transfromers-test-single | Test bert finetuning using BERT from transformer lib+TF | SageMaker createTrainingJob | 1 | train_runtime, eval_accuracy & eval_loss |
| tensorflow-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ TF SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss |
\ No newline at end of file
import importlib
def is_sagemaker_available():
    """Return ``True`` when a top-level ``sagemaker`` package can be located.

    Only the import system is queried via ``find_spec``; the result is ``False``
    when no spec is found.
    """
    # ``import importlib`` alone does not guarantee that the ``importlib.util``
    # submodule is loaded, so import it explicitly before using it.
    import importlib.util

    return importlib.util.find_spec("sagemaker") is not None
# we define a fixture function below and it will be "used" by
# referencing its name from tests
import os
import pytest
from attr import dataclass
# Set AWS_DEFAULT_REGION for this process at import time (any existing value is overwritten).
# The image URIs returned by SageMakerTestEnvironment.image_uri also point at us-east-1.
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"  # defaults region
@dataclass
class SageMakerTestEnvironment:
    """Static configuration shared by the SageMaker tests for one framework.

    Attributes:
        framework: Framework name. The properties below branch on ``"pytorch"``
            and treat every other value as the TensorFlow case.
    """

    framework: str
    # IAM role ARN handed to the estimator by the tests.
    role = "arn:aws:iam::558105141721:role/sagemaker_execution_role"
    # Hyperparameters forwarded to the training script.
    hyperparameters = {
        "task_name": "mnli",
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "do_train": True,
        "do_eval": True,
        "do_predict": True,
        "output_dir": "/opt/ml/model",
        "overwrite_output_dir": True,
        "max_steps": 500,
        "save_steps": 5500,
    }
    # Same hyperparameters with a larger step budget (used by the multi-node tests).
    distributed_hyperparameters = {**hyperparameters, "max_steps": 1000}

    @property
    def metric_definitions(self) -> list:
        """Return the list of ``{"Name": ..., "Regex": ...}`` metric definitions for the framework.

        The regexes are written as raw strings so that ``\\D`` is not treated as
        an (invalid) string escape; the resulting patterns are unchanged.
        """
        if self.framework == "pytorch":
            return [
                {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
                {"Name": "eval_accuracy", "Regex": r"eval_accuracy.*=\D*(.*?)$"},
                {"Name": "eval_loss", "Regex": r"eval_loss.*=\D*(.*?)$"},
            ]
        else:
            # NOTE(review): "eval_accuracy" is extracted with a pattern matching "loss" and
            # "eval_loss" with a pattern matching "sparse_categorical_accuracy" -- this looks
            # swapped. Left unchanged because the test thresholds may be tuned to it; confirm.
            return [
                {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
                {"Name": "eval_accuracy", "Regex": r"loss.*=\D*(.*?)]?$"},
                {"Name": "eval_loss", "Regex": r"sparse_categorical_accuracy.*=\D*(.*?)]?$"},
            ]

    @property
    def base_job_name(self) -> str:
        """Return the job-name prefix for this framework."""
        return f"{self.framework}-transfromers-test"

    @property
    def test_path(self) -> str:
        """Return the directory holding this framework's training scripts."""
        return f"./tests/sagemaker/scripts/{self.framework}"

    @property
    def image_uri(self) -> str:
        """Return the training container image URI for this framework."""
        if self.framework == "pytorch":
            return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04"
        else:
            return "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04"
@pytest.fixture(scope="class")
def sm_env(request):
    """Class-scoped fixture: attach a ``SageMakerTestEnvironment`` to the requesting class as ``env``.

    The requesting test class must define a ``framework`` attribute, which is
    passed on to the environment.
    """
    test_cls = request.cls
    test_cls.env = SageMakerTestEnvironment(framework=test_cls.framework)
git+https://github.com/huggingface/transformers.git@master # install master, or replace it with vX.X.X to install a specific transformers version
\ No newline at end of file
import json
import logging
import os
import subprocess
from argparse import ArgumentParser
logger = logging.getLogger(__name__)
def parse_args():
    """Parse arbitrary ``--name value`` / ``--name=value`` options from the command line.

    No options are declared up front: a first pass collects everything the empty
    parser does not recognise, each token that looks like an option is registered
    under its name (the part before any ``=``), and the command line is parsed again.

    Returns:
        The ``argparse.Namespace`` produced by the second pass.
    """
    parser = ArgumentParser()
    _, leftover = parser.parse_known_args()
    option_names = (token.split("=")[0] for token in leftover if token.startswith(("-", "--")))
    for option_name in option_names:
        parser.add_argument(option_name)
    return parser.parse_args()
def main():
    """Launch ``./run_glue.py`` through ``torch.distributed.launch`` on this host.

    The topology is read from the ``SM_NUM_GPUS``, ``SM_HOSTS`` and
    ``SM_CURRENT_HOST`` environment variables. Every command-line option parsed
    by ``parse_args`` is forwarded to ``run_glue.py`` as ``--name value``.

    Raises:
        KeyError: if one of the ``SM_*`` environment variables is missing.
        subprocess.CalledProcessError: if the launched process exits with a
            non-zero status, so a failed training run fails this script too
            instead of being silently ignored.
    """
    args = parse_args()
    port = 8888
    num_gpus = int(os.environ["SM_NUM_GPUS"])
    hosts = json.loads(os.environ["SM_HOSTS"])
    num_nodes = len(hosts)
    current_host = os.environ["SM_CURRENT_HOST"]
    rank = hosts.index(current_host)
    os.environ["NCCL_DEBUG"] = "INFO"

    # Build the command as an argument list (no shell), so hyperparameter values
    # are passed through verbatim and never interpreted by a shell.
    cmd = ["python", "-m", "torch.distributed.launch"]
    if num_nodes > 1:
        cmd += [
            f"--nnodes={num_nodes}",
            f"--node_rank={rank}",
            f"--nproc_per_node={num_gpus}",
            f"--master_addr={hosts[0]}",
            f"--master_port={port}",
        ]
    else:
        cmd.append(f"--nproc_per_node={num_gpus}")
    cmd.append("./run_glue.py")
    for parameter, value in vars(args).items():
        cmd += [f"--{parameter}", str(value)]

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        logger.error("distributed launch failed: %s", e)
        raise


if __name__ == "__main__":
    main()
This diff is collapsed.
git+https://github.com/huggingface/transformers.git@master # install master, or replace it with vX.X.X to install a specific transformers version
\ No newline at end of file
import argparse
import logging
import sys
import time
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
if __name__ == "__main__":

    def _str_to_bool(value):
        """Parse a command-line boolean such as ``True``/``false``/``1``/``0``.

        ``type=bool`` would turn every non-empty string (including ``"False"``) into ``True``.
        """
        if isinstance(value, bool):
            return value
        normalized = value.strip().lower()
        if normalized in ("true", "1", "yes", "y"):
            return True
        if normalized in ("false", "0", "no", "n", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--per_device_train_batch_size", type=int, default=16)
    parser.add_argument("--per_device_eval_batch_size", type=int, default=8)
    parser.add_argument("--model_name_or_path", type=str)
    # The learning rate must be numeric: with type=str a value given on the
    # command line would reach the optimizer as a string.
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--do_train", type=_str_to_bool, default=True)
    parser.add_argument("--do_eval", type=_str_to_bool, default=True)
    parser.add_argument("--output_dir", type=str)

    # Unknown options are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    # overwrite batch size until we have tf_glue.py
    args.per_device_train_batch_size = 16
    args.per_device_eval_batch_size = 16

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Load model and tokenizer
    model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])
    train_dataset = train_dataset.shuffle().select(range(5000))  # shrink the train dataset to 5k examples
    test_dataset = test_dataset.shuffle().select(range(500))  # shrink the test dataset to 500 examples

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    train_features = {
        x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    tf_train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["label"])).batch(
        args.per_device_train_batch_size
    )

    # Preprocess test dataset
    test_dataset = test_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    test_features = {
        x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    # NOTE(review): tf_test_dataset is built but not used further down in this script.
    tf_test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["label"])).batch(
        args.per_device_eval_batch_size
    )

    # define optimizer and loss
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Train and log the wall-clock training time plus every entry of the Keras history.
    start_train_time = time.time()
    train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.per_device_train_batch_size)
    end_train_time = time.time() - start_train_time

    logger.info("*** Train ***")
    logger.info("train_runtime = %s", end_train_time)
    for key, value in train_results.history.items():
        logger.info(" %s = %s", key, value)
import argparse
import logging
import os
import sys
import time
import tensorflow as tf
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers.file_utils import is_sagemaker_distributed_available
# Use smdistributed data parallel when the SDP_ENABLED environment variable is set
# to a non-empty value or transformers reports it as available.
SDP_ENABLED = bool(os.environ.get("SDP_ENABLED") or is_sagemaker_distributed_available())
if SDP_ENABLED:
    os.environ["SAGEMAKER_INSTANCE_TYPE"] = "p3dn.24xlarge"

    import smdistributed.dataparallel.tensorflow as sdp
def fit(model, loss, opt, train_dataset, epochs, train_batch_size, max_steps=None):
    """Run a custom training loop over ``train_dataset`` and return the last loss.

    Args:
        model: model called on each batch; its output must expose ``.logits``.
        loss: callable ``loss(targets, logits)`` returning the loss tensor.
        opt: optimizer providing ``apply_gradients`` (and ``variables()`` when
            smdistributed data parallel is enabled).
        train_dataset: iterable of ``(inputs, targets)`` batches.
        epochs: accepted for interface compatibility; not used by this loop.
        train_batch_size: accepted for interface compatibility; not used by this loop.
        max_steps: stop after this many batches; ``None`` or ``0`` means the
            whole dataset is consumed once.

    Returns:
        ``{"loss": <loss of the last processed batch>}``.

    Raises:
        ValueError: if ``train_dataset`` yields no batches.
    """
    loss_value = None
    pbar = tqdm(train_dataset)
    for i, batch in enumerate(pbar):
        with tf.GradientTape() as tape:
            _, targets = batch
            # NOTE(review): the whole (inputs, targets) batch is passed to the model,
            # not just the inputs -- kept as in the original; confirm this is intended.
            outputs = model(batch)
            loss_value = loss(targets, outputs.logits)

        if SDP_ENABLED:
            tape = sdp.DistributedGradientTape(tape, sparse_as_dense=True)

        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        pbar.set_description(f"Loss: {loss_value:.4f}")

        # Broadcast model and optimizer variables from rank 0 once, after the first step.
        if SDP_ENABLED and i == 0:
            sdp.broadcast_variables(model.variables, root_rank=0)
            sdp.broadcast_variables(opt.variables(), root_rank=0)

        # ``i`` is zero-based, so ``i + 1`` batches are done here; comparing ``i``
        # itself would run one step more than ``max_steps``.
        if max_steps and i + 1 >= max_steps:
            break

    if loss_value is None:
        raise ValueError("train_dataset yielded no batches; nothing was trained")

    train_results = {"loss": loss_value.numpy()}
    return train_results
def get_datasets(tokenizer, train_batch_size, eval_batch_size):
    """Build batched ``tf.data`` train and test datasets from the ``imdb`` dataset.

    Both splits are tokenized with ``tokenizer`` (truncated and padded to
    ``max_length``), converted to ``(features, label)`` slices, sharded per
    worker when smdistributed data parallel is enabled, and batched with
    ``drop_remainder=True``.

    Returns:
        A ``(tf_train_dataset, tf_test_dataset)`` tuple.
    """

    def _to_tf_dataset(split):
        # Tokenize one split and wrap it as a tf.data.Dataset of (features, label).
        tokenized = split.map(
            lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
        )
        tokenized.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
        features = {
            x: tokenized[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
            for x in ["input_ids", "attention_mask"]
        }
        return tf.data.Dataset.from_tensor_slices((features, tokenized["label"]))

    # Load dataset
    raw_train, raw_test = load_dataset("imdb", split=["train", "test"])

    tf_train_dataset = _to_tf_dataset(raw_train)
    tf_test_dataset = _to_tf_dataset(raw_test)

    # Each worker only sees its own shard of the data.
    if SDP_ENABLED:
        tf_train_dataset = tf_train_dataset.shard(sdp.size(), sdp.rank())
        tf_test_dataset = tf_test_dataset.shard(sdp.size(), sdp.rank())

    return (
        tf_train_dataset.batch(train_batch_size, drop_remainder=True),
        tf_test_dataset.batch(eval_batch_size, drop_remainder=True),
    )
if __name__ == "__main__":

    def _str_to_bool(value):
        """Parse a command-line boolean such as ``True``/``false``/``1``/``0``.

        ``type=bool`` would turn every non-empty string (including ``"False"``) into ``True``.
        """
        if isinstance(value, bool):
            return value
        normalized = value.strip().lower()
        if normalized in ("true", "1", "yes", "y"):
            return True
        if normalized in ("false", "0", "no", "n", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--per_device_train_batch_size", type=int, default=16)
    parser.add_argument("--per_device_eval_batch_size", type=int, default=8)
    parser.add_argument("--model_name_or_path", type=str)
    # The learning rate must be numeric: with type=str a value given on the
    # command line would reach the optimizer as a string.
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--do_train", type=_str_to_bool, default=True)
    parser.add_argument("--do_eval", type=_str_to_bool, default=True)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--max_steps", type=int, default=None)

    # Data, model, and output directories
    # (the SM_* variables are read eagerly, so they must be set even if the options are given)
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])

    # Unknown options are ignored rather than rejected.
    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    if SDP_ENABLED:
        sdp.init()

        # Restrict this process to the GPU matching its local rank.
        gpus = tf.config.experimental.list_physical_devices("GPU")
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[sdp.local_rank()], "GPU")

    # Load model and tokenizer
    model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # get datasets
    tf_train_dataset, tf_test_dataset = get_datasets(
        tokenizer=tokenizer,
        train_batch_size=args.per_device_train_batch_size,
        eval_batch_size=args.per_device_eval_batch_size,
    )

    # define optimizer and loss
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Only the non-distributed run, or rank 0 of a distributed run, writes files.
    is_main_process = not SDP_ENABLED or sdp.rank() == 0

    # Training
    if args.do_train:
        # train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.train_batch_size)
        start_train_time = time.time()
        train_results = fit(
            model,
            loss,
            optimizer,
            tf_train_dataset,
            args.epochs,
            args.per_device_train_batch_size,
            max_steps=args.max_steps,
        )
        end_train_time = time.time() - start_train_time
        logger.info("*** Train ***")
        logger.info("train_runtime = %s", end_train_time)

        output_train_file = os.path.join(args.output_dir, "train_results.txt")

        if is_main_process:
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                logger.info(train_results)
                for key, value in train_results.items():
                    logger.info(" %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

    # Evaluation
    if args.do_eval and is_main_process:
        result = model.evaluate(tf_test_dataset, batch_size=args.per_device_eval_batch_size, return_dict=True)
        logger.info("*** Evaluate ***")

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info(result)
            for key, value in result.items():
                logger.info(" %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

    # Save result
    if is_main_process:
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
import os
import subprocess
import unittest
from ast import literal_eval
import pytest
from parameterized import parameterized, parameterized_class
from . import is_sagemaker_available
if is_sagemaker_available():
from sagemaker import TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace
@pytest.mark.skipif(
    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
    reason="Skipping test because should only be run when releasing minor transformers version",
)
@pytest.mark.usefixtures("sm_env")
@parameterized_class(
    [
        {
            "framework": "pytorch",
            "script": "run_glue.py",
            "model_name_or_path": "distilbert-base-cased",
            "instance_type": "ml.p3dn.24xlarge",
            "results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6},
        },
        {
            "framework": "pytorch",
            "script": "run_ddp.py",
            "model_name_or_path": "distilbert-base-cased",
            "instance_type": "ml.p3dn.24xlarge",
            "results": {"train_runtime": 300, "eval_accuracy": 0.7, "eval_loss": 0.6},
        },
        {
            "framework": "tensorflow",
            "script": "run_tf_dist.py",
            "model_name_or_path": "distilbert-base-cased",
            "instance_type": "ml.p3dn.24xlarge",
            "results": {"train_runtime": 500, "eval_accuracy": 0.6, "eval_loss": 0.7},
        },
    ]
)
class MultiNodeTest(unittest.TestCase):
    """Multi-node data-parallel training tests on SageMaker.

    Each parameter set supplies ``framework``, ``script``, ``model_name_or_path``,
    ``instance_type`` and the KPI thresholds in ``results``. The ``sm_env``
    fixture provides ``self.env``.
    """

    def setUp(self):
        # Check for the fixture-provided environment before anything below uses it.
        assert hasattr(self, "env")
        if self.framework == "pytorch":
            # Copy the example script next to the test scripts so it is part of source_dir.
            subprocess.run(
                ["cp", "./examples/text-classification/run_glue.py", f"{self.env.test_path}/run_glue.py"],
                encoding="utf-8",
                check=True,
            )

    def create_estimator(self, instance_count):
        """Return a ``HuggingFace`` estimator for ``self.script`` on ``instance_count`` instances."""
        job_name = f"{self.env.base_job_name}-{instance_count}-{'ddp' if 'ddp' in self.script else 'smd'}"
        # distributed data settings: run_ddp.py starts torch.distributed.launch itself,
        # so smdistributed data parallel is only enabled for the other scripts.
        distribution = {"smdistributed": {"dataparallel": {"enabled": True}}} if self.script != "run_ddp.py" else None

        # creates estimator
        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=job_name,
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters={**self.env.distributed_hyperparameters, "model_name_or_path": self.model_name_or_path},
            metric_definitions=self.env.metric_definitions,
            distribution=distribution,
            py_version="py36",
        )

    def save_results_as_csv(self, job_name):
        """Export the metrics of ``job_name`` as a CSV file into the test script directory."""
        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")

    # @parameterized.expand([(2,), (4,),])
    @parameterized.expand([(2,)])
    def test_script(self, instance_count):
        """Run the training job and compare the collected metrics with ``self.results``."""
        # create estimator
        estimator = self.create_estimator(instance_count)

        # run training
        estimator.fit()

        # save csv
        self.save_results_as_csv(estimator.latest_training_job.name)
        # result dataframe
        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

        # extract kpis
        train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])

        # assert kpis
        # NOTE(review): the ``all(...)`` checks pass vacuously when a metric list is empty.
        assert all(t <= self.results["train_runtime"] for t in train_runtime)
        assert any(t >= self.results["eval_accuracy"] for t in eval_accuracy)
        assert all(t <= self.results["eval_loss"] for t in eval_loss)
import os
import unittest
from ast import literal_eval
import pytest
from parameterized import parameterized, parameterized_class
from . import is_sagemaker_available
if is_sagemaker_available():
from sagemaker import TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace
@pytest.mark.skipif(
    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
    reason="Skipping test because should only be run when releasing minor transformers version",
)
@pytest.mark.usefixtures("sm_env")
@parameterized_class(
    [
        {
            "framework": "pytorch",
            "script": "run_glue_model_parallelism.py",
            "model_name_or_path": "roberta-large",
            "instance_type": "ml.p3dn.24xlarge",
            "results": {"train_runtime": 700, "eval_accuracy": 0.3, "eval_loss": 1.2},
        },
    ]
)
class MultiNodeTest(unittest.TestCase):
    """Model-parallel (smdistributed) training test on SageMaker.

    The parameter set supplies ``framework``, ``script``, ``model_name_or_path``,
    ``instance_type`` and the KPI thresholds in ``results``. The ``sm_env``
    fixture provides ``self.env``.
    """

    def setUp(self):
        assert hasattr(self, "env")

    def create_estimator(self, instance_count):
        """Return a ``HuggingFace`` estimator configured for smdistributed model parallelism."""
        # configuration for running training on smdistributed Model Parallel
        distribution = {
            "smdistributed": {
                "modelparallel": {
                    "enabled": True,
                    "parameters": {
                        "microbatches": 4,
                        "placement_strategy": "spread",
                        "pipeline": "interleaved",
                        "optimize": "speed",
                        "partitions": 4,
                        "ddp": True,
                    },
                }
            },
            "mpi": {
                "enabled": True,
                "processes_per_host": 8,
            },
        }
        job_hyperparameters = {
            **self.env.hyperparameters,
            "model_name_or_path": self.model_name_or_path,
            "max_steps": 500,
        }

        # creates estimator
        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=f"{self.env.base_job_name}-{instance_count}-smp",
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters=job_hyperparameters,
            metric_definitions=self.env.metric_definitions,
            distribution=distribution,
            py_version="py36",
        )

    def save_results_as_csv(self, job_name):
        """Export the metrics of ``job_name`` as a CSV file into the test script directory."""
        csv_path = f"{self.env.test_path}/{job_name}_metrics.csv"
        TrainingJobAnalytics(job_name).export_csv(csv_path)

    # @parameterized.expand([(2,), (4,),])
    @parameterized.expand([(1,)])
    def test_scripz(self, instance_count):
        """Run the training job and compare the collected metrics with ``self.results``."""
        # create estimator and run training
        estimator = self.create_estimator(instance_count)
        estimator.fit()

        # save csv
        self.save_results_as_csv(estimator.latest_training_job.name)
        # result dataframe
        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

        def _metric_values(metric):
            # All recorded values of one metric, in dataframe order.
            return list(result_metrics_df[result_metrics_df.metric_name == metric]["value"])

        # extract kpis
        train_runtime = _metric_values("train_runtime")
        eval_accuracy = _metric_values("eval_accuracy")
        eval_loss = _metric_values("eval_loss")

        # assert kpis
        assert all(t <= self.results["train_runtime"] for t in train_runtime)
        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
        assert all(t <= self.results["eval_loss"] for t in eval_loss)
import os
import subprocess
import unittest
from ast import literal_eval
import pytest
from parameterized import parameterized_class
from . import is_sagemaker_available
if is_sagemaker_available():
from sagemaker import TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace
@pytest.mark.skipif(
    literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
    reason="Skipping test because should only be run when releasing minor transformers version",
)
@pytest.mark.usefixtures("sm_env")
@parameterized_class(
    [
        {
            "framework": "pytorch",
            "script": "run_glue.py",
            "model_name_or_path": "distilbert-base-cased",
            "instance_type": "ml.g4dn.xlarge",
            "results": {"train_runtime": 200, "eval_accuracy": 0.6, "eval_loss": 0.9},
        },
        {
            "framework": "tensorflow",
            "script": "run_tf.py",
            "model_name_or_path": "distilbert-base-cased",
            "instance_type": "ml.g4dn.xlarge",
            "results": {"train_runtime": 350, "eval_accuracy": 0.3, "eval_loss": 0.9},
        },
    ]
)
class SingleNodeTest(unittest.TestCase):
    """Train on a single instance and check the resulting KPIs.

    The class is skipped unless the ``TEST_SAGEMAKER`` environment variable
    evaluates to ``True``. ``framework``, ``script``, ``model_name_or_path``,
    ``instance_type`` and ``results`` are injected by ``parameterized_class``;
    ``env`` is presumably provided by the ``sm_env`` fixture (defined outside
    this file section).
    """

    def setUp(self):
        """Check the ``env`` precondition and stage the PyTorch example script.

        For the ``pytorch`` parameterization the ``run_glue.py`` example is
        copied into ``self.env.test_path`` so it can be used as entry point.

        Raises:
            AssertionError: If the test case has no ``env`` attribute.
            subprocess.CalledProcessError: If the copy command fails.
        """
        # Verify the precondition before ``self.env`` is dereferenced below;
        # checking afterwards could never fire for the pytorch case.
        assert hasattr(self, "env")

        if self.framework == "pytorch":
            # Pass argv as an explicit list so a test path containing
            # whitespace is not broken apart by str.split().
            subprocess.run(
                ["cp", "./examples/text-classification/run_glue.py", f"{self.env.test_path}/run_glue.py"],
                encoding="utf-8",
                check=True,
            )

    def create_estimator(self, instance_count=1):
        """Build the ``HuggingFace`` estimator for a single-node training job.

        Args:
            instance_count: Number of instances requested for the job.

        Returns:
            The configured ``HuggingFace`` estimator.
        """
        # creates estimator
        return HuggingFace(
            entry_point=self.script,
            source_dir=self.env.test_path,
            role=self.env.role,
            image_uri=self.env.image_uri,
            base_job_name=f"{self.env.base_job_name}-single",
            instance_count=instance_count,
            instance_type=self.instance_type,
            debugger_hook_config=False,
            hyperparameters={**self.env.hyperparameters, "model_name_or_path": self.model_name_or_path},
            metric_definitions=self.env.metric_definitions,
            py_version="py36",
        )

    def save_results_as_csv(self, job_name):
        """Write the metrics of ``job_name`` to ``<test_path>/<job_name>_metrics.csv``."""
        TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")

    def test_glue(self):
        """Run one training job and compare its metrics with ``self.results``."""
        # create estimator
        estimator = self.create_estimator()

        # run training
        estimator.fit()

        # save csv
        self.save_results_as_csv(estimator.latest_training_job.name)

        # result dataframe
        result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

        # extract kpis
        train_runtime = list(result_metrics_df[result_metrics_df.metric_name == "train_runtime"]["value"])
        eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
        eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])

        # assert kpis: every recorded value has to satisfy its threshold
        assert all(t <= self.results["train_runtime"] for t in train_runtime)
        assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
        assert all(t <= self.results["eval_loss"] for t in eval_loss)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment