Unverified commit 5042dc00, authored by Shaden Smith and committed by GitHub

drafting Jekyll webpage (#143)

parent d6bc44bf
...@@ -9,3 +9,10 @@ build/
dist/
fused_lamb_*.so
deepspeed.egg-info/
# Website
docs/_site/
docs/code-docs/build
.sass-cache/
.jekyll-cache/
.jekyll-metadata
---
permalink: /404.html
layout: default
---
<style type="text/css" media="screen">
.container {
margin: 10px auto;
max-width: 600px;
text-align: center;
}
h1 {
margin: 30px 0;
font-size: 4em;
line-height: 1;
letter-spacing: -1px;
}
</style>
<div class="container">
<h1>404</h1>
<p><strong>Page not found :(</strong></p>
<p>The requested page could not be found.</p>
</div>
www.deepspeed.ai
source "https://rubygems.org"
gem 'github-pages', group: :jekyll_plugins
# If you have any plugins, put them here!
group :jekyll_plugins do
gem "jekyll-feed"
gem "jekyll-paginate"
gem "jekyll-remote-theme"
gem "jekyll-include-cache"
gem "minimal-mistakes-jekyll"
end
# Windows and JRuby do not include zoneinfo files, so bundle the tzinfo-data gem
# and associated library.
install_if -> { RUBY_PLATFORM =~ %r!mingw|mswin|java! } do
gem "tzinfo", "~> 1.2"
gem "tzinfo-data"
end
# Performance-booster for watching directories on Windows
gem "wdm", "~> 0.1.1", :install_if => Gem.win_platform?
GEM
remote: https://rubygems.org/
specs:
activesupport (6.0.2.1)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (>= 0.7, < 2)
minitest (~> 5.1)
tzinfo (~> 1.1)
zeitwerk (~> 2.2)
addressable (2.7.0)
public_suffix (>= 2.0.2, < 5.0)
coffee-script (2.4.1)
coffee-script-source
execjs
coffee-script-source (1.11.1)
colorator (1.1.0)
commonmarker (0.17.13)
ruby-enum (~> 0.5)
concurrent-ruby (1.1.6)
dnsruby (1.61.3)
addressable (~> 2.5)
em-websocket (0.5.1)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0.6.0)
ethon (0.12.0)
ffi (>= 1.3.0)
eventmachine (1.2.7)
execjs (2.7.0)
faraday (1.0.0)
multipart-post (>= 1.2, < 3)
ffi (1.12.2)
forwardable-extended (2.6.0)
gemoji (3.0.1)
github-pages (204)
github-pages-health-check (= 1.16.1)
jekyll (= 3.8.5)
jekyll-avatar (= 0.7.0)
jekyll-coffeescript (= 1.1.1)
jekyll-commonmark-ghpages (= 0.1.6)
jekyll-default-layout (= 0.1.4)
jekyll-feed (= 0.13.0)
jekyll-gist (= 1.5.0)
jekyll-github-metadata (= 2.13.0)
jekyll-mentions (= 1.5.1)
jekyll-optional-front-matter (= 0.3.2)
jekyll-paginate (= 1.1.0)
jekyll-readme-index (= 0.3.0)
jekyll-redirect-from (= 0.15.0)
jekyll-relative-links (= 0.6.1)
jekyll-remote-theme (= 0.4.1)
jekyll-sass-converter (= 1.5.2)
jekyll-seo-tag (= 2.6.1)
jekyll-sitemap (= 1.4.0)
jekyll-swiss (= 1.0.0)
jekyll-theme-architect (= 0.1.1)
jekyll-theme-cayman (= 0.1.1)
jekyll-theme-dinky (= 0.1.1)
jekyll-theme-hacker (= 0.1.1)
jekyll-theme-leap-day (= 0.1.1)
jekyll-theme-merlot (= 0.1.1)
jekyll-theme-midnight (= 0.1.1)
jekyll-theme-minimal (= 0.1.1)
jekyll-theme-modernist (= 0.1.1)
jekyll-theme-primer (= 0.5.4)
jekyll-theme-slate (= 0.1.1)
jekyll-theme-tactile (= 0.1.1)
jekyll-theme-time-machine (= 0.1.1)
jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.11.1)
kramdown (= 1.17.0)
liquid (= 4.0.3)
mercenary (~> 0.3)
minima (= 2.5.1)
nokogiri (>= 1.10.4, < 2.0)
rouge (= 3.13.0)
terminal-table (~> 1.4)
github-pages-health-check (1.16.1)
addressable (~> 2.3)
dnsruby (~> 1.60)
octokit (~> 4.0)
public_suffix (~> 3.0)
typhoeus (~> 1.3)
html-pipeline (2.12.3)
activesupport (>= 2)
nokogiri (>= 1.4)
http_parser.rb (0.6.0)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
jekyll (3.8.5)
addressable (~> 2.4)
colorator (~> 1.0)
em-websocket (~> 0.5)
i18n (~> 0.7)
jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0)
kramdown (~> 1.14)
liquid (~> 4.0)
mercenary (~> 0.3.3)
pathutil (~> 0.9)
rouge (>= 1.7, < 4)
safe_yaml (~> 1.0)
jekyll-avatar (0.7.0)
jekyll (>= 3.0, < 5.0)
jekyll-coffeescript (1.1.1)
coffee-script (~> 2.2)
coffee-script-source (~> 1.11.1)
jekyll-commonmark (1.3.1)
commonmarker (~> 0.14)
jekyll (>= 3.7, < 5.0)
jekyll-commonmark-ghpages (0.1.6)
commonmarker (~> 0.17.6)
jekyll-commonmark (~> 1.2)
rouge (>= 2.0, < 4.0)
jekyll-default-layout (0.1.4)
jekyll (~> 3.0)
jekyll-feed (0.13.0)
jekyll (>= 3.7, < 5.0)
jekyll-gist (1.5.0)
octokit (~> 4.2)
jekyll-github-metadata (2.13.0)
jekyll (>= 3.4, < 5.0)
octokit (~> 4.0, != 4.4.0)
jekyll-include-cache (0.2.0)
jekyll (>= 3.7, < 5.0)
jekyll-mentions (1.5.1)
html-pipeline (~> 2.3)
jekyll (>= 3.7, < 5.0)
jekyll-optional-front-matter (0.3.2)
jekyll (>= 3.0, < 5.0)
jekyll-paginate (1.1.0)
jekyll-readme-index (0.3.0)
jekyll (>= 3.0, < 5.0)
jekyll-redirect-from (0.15.0)
jekyll (>= 3.3, < 5.0)
jekyll-relative-links (0.6.1)
jekyll (>= 3.3, < 5.0)
jekyll-remote-theme (0.4.1)
addressable (~> 2.0)
jekyll (>= 3.5, < 5.0)
rubyzip (>= 1.3.0)
jekyll-sass-converter (1.5.2)
sass (~> 3.4)
jekyll-seo-tag (2.6.1)
jekyll (>= 3.3, < 5.0)
jekyll-sitemap (1.4.0)
jekyll (>= 3.7, < 5.0)
jekyll-swiss (1.0.0)
jekyll-theme-architect (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-cayman (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-dinky (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-hacker (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-leap-day (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-merlot (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-midnight (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-minimal (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-modernist (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-primer (0.5.4)
jekyll (> 3.5, < 5.0)
jekyll-github-metadata (~> 2.9)
jekyll-seo-tag (~> 2.0)
jekyll-theme-slate (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-tactile (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-time-machine (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-titles-from-headings (0.5.3)
jekyll (>= 3.3, < 5.0)
jekyll-watch (2.2.1)
listen (~> 3.0)
jemoji (0.11.1)
gemoji (~> 3.0)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
kramdown (1.17.0)
liquid (4.0.3)
listen (3.2.1)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
mercenary (0.3.6)
mini_portile2 (2.4.0)
minima (2.5.1)
jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1)
minimal-mistakes-jekyll (4.19.1)
jekyll (>= 3.7, < 5.0)
jekyll-feed (~> 0.1)
jekyll-gist (~> 1.5)
jekyll-include-cache (~> 0.1)
jekyll-paginate (~> 1.1)
jekyll-sitemap (~> 1.3)
minitest (5.14.0)
multipart-post (2.1.1)
nokogiri (1.10.9)
mini_portile2 (~> 2.4.0)
octokit (4.17.0)
faraday (>= 0.9)
sawyer (~> 0.8.0, >= 0.5.3)
pathutil (0.16.2)
forwardable-extended (~> 2.6)
public_suffix (3.1.1)
rb-fsevent (0.10.3)
rb-inotify (0.10.1)
ffi (~> 1.0)
rouge (3.13.0)
ruby-enum (0.7.2)
i18n
rubyzip (2.3.0)
safe_yaml (1.0.5)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
sawyer (0.8.2)
addressable (>= 2.3.5)
faraday (> 0.8, < 2.0)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
thread_safe (0.3.6)
typhoeus (1.3.1)
ethon (>= 0.9.0)
tzinfo (1.2.6)
thread_safe (~> 0.1)
tzinfo-data (1.2019.3)
tzinfo (>= 1.0.0)
unicode-display_width (1.7.0)
wdm (0.1.1)
zeitwerk (2.3.0)
PLATFORMS
ruby
DEPENDENCIES
github-pages
jekyll-feed
jekyll-include-cache
jekyll-paginate
jekyll-remote-theme
minimal-mistakes-jekyll
tzinfo (~> 1.2)
tzinfo-data
wdm (~> 0.1.1)
BUNDLED WITH
2.1.4
title: DeepSpeed
email: deepspeed@microsoft.com
description: >-
DeepSpeed is a deep learning optimization library that makes distributed
training easy, efficient, and effective.
locale : "en-US"
repository: microsoft/DeepSpeed
baseurl: "/" # the subpath of your site, e.g. /blog
url: "https://www.deepspeed.ai" # the base hostname & protocol for your site, e.g. http://example.com
# Build settings
remote_theme: "mmistakes/minimal-mistakes@4.19.0"
minimal_mistakes_skin : "air"
plugins:
- jekyll-feed
- jekyll-include-cache
- jekyll-paginate
#paginate: 10
#paginate_path: /blog/page:num
include: ["_pages"]
exclude: ["code-docs"]
collections:
  tutorials:
    output: true
    permalink: /:collection/:path/
defaults:
  - scope:
      path: ""
      type: posts
    values:
      layout: single
      author_profile: false
      read_time: true
      comments: false
      share: true
      related: false
  # _tutorials
  - scope:
      path: ""
      type: tutorials
    values:
      layout: single
      toc: true
      toc_label: "Contents"
      sidebar:
        nav: "lnav"
timezone: America/Los_Angeles
breadcrumbs: true
main:
  - title: "Getting Started"
    url: /getting-started/
  - title: "Blog"
    url: /blog/
  - title: "Tutorials"
    url: /tutorials/
  - title: "Documentation"
    url: https://ghpages-test.readthedocs.io/
  - title: "GitHub"
    url: https://github.com/microsoft/DeepSpeed

lnav:
  - title: "This is a floating nav bar."
  - title: "Getting Started"
    url: /getting-started/
    children:
      - title: "Installation"
        url: /getting-started/#installation
      - title: "Configuration"
        url: /getting-started/#deepspeed-configuration
---
title: "Tutorials"
layout: collection
collection: tutorials
permalink: /tutorials/
---
---
layout: single
title: "ZeRO & DeepSpeed: New system optimizations enable training models with over 100 billion parameters"
date: 2020-02-13
link: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
excerpt: "Developed by Microsoft AI & Research."
categories: news
---
---
layout: single
title: "Turing-NLG: A 17-billion-parameter language model by Microsoft"
date: 2020-02-13
link: https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/
excerpt: "DeepSpeed was used to train the world's largest language model."
categories: news
---
---
title: "ZeRO stage 1 with reduced communication"
date: 2020-03-13
excerpt: "Partition-aware ZeRO with up to 2x reduction in communication time!"
---
# ZeRO stage 1 with reduced communication
* Partition-aware approach replaces the initial implementation, which used a global collective (all-reduce)
* Total communication volume reduced from 1.5x to 1x of data parallelism
* Up to 2x reduction in communication time compared to all-reduce
# Further updates coming soon!
---
title: "ZeRO stage 2"
date: 2020-03-13
excerpt: "Reduce memory footprint to enable training 10B models without model parallelism!"
---
# ZeRO Stage 2
* Reduce memory footprint of gradients
* Train larger models: e.g., 10B parameters on 32 GPUs without model parallelism
* Train larger batch sizes
# Further updates coming soon!
---
title: "CIFAR-10 Tutorial"
excerpt: "Train your first model with DeepSpeed!"
---
If you haven't already, we advise you to first read through the [Getting
Started](../../README.md#getting-started) guide before stepping through this
...@@ -10,22 +13,22 @@ First we will go over how to run original CIFAR-10. Then we will proceed step-by
## Running Original CIFAR-10
The original model code is from the [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py). We've copied this repo under [DeepSpeedExamples/cifar/](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) and made it available as a submodule. To download, execute:
```bash
git submodule update --init --recursive
```
To install requirements for CIFAR-10:
```bash
cd DeepSpeedExamples/cifar
pip install -r requirements.txt
```
Run `python cifar10_tutorial.py`; it downloads the training data set on the first run.
```
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
170500096it [00:02, 61124868.24it/s]
Extracting ./data/cifar-10-python.tar.gz to ./data
```
...@@ -63,10 +66,10 @@ cuda:0
## Enabling DeepSpeed
### Argument Parsing
The first step to apply DeepSpeed is adding DeepSpeed arguments to the CIFAR-10 model, using the `deepspeed.add_config_arguments()` function as below.
...@@ -103,7 +106,7 @@ The first step to apply DeepSpeed is adding DeepSpeed arguments to CIFAR-10 mode
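The parser changes themselves are elided from this diff; as a rough, illustrative sketch (the argument names below are assumptions, not the example's exact flags), the pattern looks like:

```python
import argparse
import deepspeed

def add_argument():
    parser = argparse.ArgumentParser(description='CIFAR-10 with DeepSpeed')
    # example-specific arguments (names are illustrative)
    parser.add_argument('-e', '--epochs', default=30, type=int,
                        help='number of total epochs to run')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from the distributed launcher')
    # adds DeepSpeed's own arguments, e.g. --deepspeed and --deepspeed_config
    parser = deepspeed.add_config_arguments(parser)
    return parser.parse_args()

args = add_argument()
```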
### Initialization
We use `deepspeed.initialize` to create `model_engine`, `optimizer` and `trainloader`. Below is its definition.
...@@ -144,27 +147,28 @@ The original device and optimizer can be removed after initializing DeepSpeed.
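The initialization call is elided from this diff; a minimal sketch of the typical pattern (the names `net`, `trainset`, and `args` are assumed from the surrounding example, not confirmed by this diff):

```python
import deepspeed

# net: the original torch.nn.Module from the CIFAR-10 example
# trainset: the CIFAR-10 training dataset; args: parsed command-line arguments
parameters = filter(lambda p: p.requires_grad, net.parameters())

model_engine, optimizer, trainloader, _ = deepspeed.initialize(
    args=args,                   # contains --deepspeed_config
    model=net,                   # the original model
    model_parameters=parameters,
    training_data=trainset)      # DeepSpeed builds a distributed data loader from this
```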
### Training API
The `model` returned by `deepspeed.initialize` is the _DeepSpeed Model Engine_ that we will use to train the model using the forward, backward and step API.
```python
for i, data in enumerate(trainloader):
    # get the inputs; data is a list of [inputs, labels]
    inputs = data[0].to(model_engine.device)
    labels = data[1].to(model_engine.device)
    outputs = model_engine(inputs)
    loss = criterion(outputs, labels)
    model_engine.backward(loss)
    model_engine.step()
```
Zeroing the gradients is handled automatically by DeepSpeed after the weights have been updated using a mini-batch.
### Configuration
The next step to use DeepSpeed is to create a configuration JSON file (ds_config.json). This file provides DeepSpeed specific parameters defined by the user, e.g., batch size, optimizer, scheduler and other parameters.
...@@ -198,20 +202,17 @@ The next step to use DeepSpeed is to create a configuration JSON file (ds_config
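The full `ds_config.json` for this tutorial is elided from the diff; an illustrative sketch using standard DeepSpeed configuration keys (the specific values are assumptions, not the tutorial's actual settings):

```json
{
  "train_batch_size": 16,
  "steps_per_print": 2000,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.001
    }
  },
  "fp16": {
    "enabled": false
  }
}
```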
### Run CIFAR-10 Model with DeepSpeed Enabled
To start training the CIFAR-10 model with DeepSpeed applied, execute the following command; it will use all detected GPUs by default.
```bash
deepspeed cifar10_deepspeed.py --deepspeed_config ds_config.json
```
DeepSpeed usually prints more training details for the user to monitor, including training settings, performance statistics and loss trends.
```
deepspeed.pt cifar10_deepspeed.py --deepspeed_config ds_config.json
Warning: Permanently added '[192.168.0.22]:42227' (ECDSA) to the list of known hosts.
cmd=['pdsh', '-w', 'worker-0', 'export NCCL_VERSION=2.4.2; ', 'cd /data/users/deepscale/test/ds_v2/examples/cifar;', '/usr/bin/python', '-u', '-m', 'deepspeed.pt.deepspeed_launch', '--world_info=eyJ3b3JrZXItMCI6IFswXX0=', '--node_rank=%n', '--master_addr=192.168.0.22', '--master_port=29500', 'cifar10_deepspeed.py', '--deepspeed', '--deepspeed_config', 'ds_config.json']
worker-0: Warning: Permanently added '[192.168.0.22]:42227' (ECDSA) to the list of known hosts.
...
```
---
title: "Getting Started"
permalink: /getting-started/
excerpt: "First steps with DeepSpeed"
---
## Installation
* Please see our [Azure tutorial](docs/azure.md) to get started with DeepSpeed on Azure!
* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
* If you want to install DeepSpeed manually, we provide an install script [install.sh](install.sh) to help install on a local machine or across an entire cluster.
## Writing DeepSpeed Models
DeepSpeed model training is accomplished using the DeepSpeed engine. The engine
can wrap an arbitrary model of type `torch.nn.Module` and has a minimal set of APIs
for training and checkpointing the model. Please see the tutorials for detailed
examples.
To initialize the DeepSpeed engine:
```python
model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args,
                                                      model=model,
                                                      model_parameters=params)
```
`deepspeed.initialize` ensures that all of the necessary setup required for
distributed data parallel or mixed precision training is done
appropriately under the hood. In addition to wrapping the model, DeepSpeed can
construct and manage the training optimizer, data loader, and the learning rate
scheduler based on the parameters passed to `deepspeed.initialize` and the
DeepSpeed [configuration file](#deepspeed-configuration).
### Training
Once the DeepSpeed engine has been initialized, it can be used to train the
model using three simple APIs: forward propagation (calling the engine directly),
backward propagation (`backward`), and weight updates (`step`).
```python
for step, batch in enumerate(data_loader):
    # forward() method
    loss = model_engine(batch)
    # runs backpropagation
    model_engine.backward(loss)
    # weight update
    model_engine.step()
```
Under the hood, DeepSpeed automatically performs the necessary operations
required for distributed data parallel training, in mixed precision, with a
pre-defined learning rate schedule:
* **Gradient Averaging**: in distributed data parallel training, `backward`
ensures that gradients are averaged across data parallel processes after
training on a `train_batch_size`.
* **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed
engine automatically handles scaling the loss to avoid precision loss in the
gradients.
* **Learning Rate Schedule**: if using DeepSpeed's learning rate
schedule, then DeepSpeed automatically handles any updates to the learning
rate when `step` is executed (a configuration sketch follows this list).
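As a concrete illustration of the last point, the schedule is selected in the DeepSpeed config file rather than in code; a minimal sketch assuming DeepSpeed's `WarmupLR` schedule (the values here are illustrative):

```json
{
  "train_batch_size": 8,
  "optimizer": {
    "type": "Adam",
    "params": { "lr": 0.00015 }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 0.00015,
      "warmup_num_steps": 1000
    }
  }
}
```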
### Model Checkpointing
Saving and loading the training state is handled via the `save_checkpoint` and
`load_checkpoint` APIs in DeepSpeed, which take two arguments to uniquely
identify a checkpoint:
* `ckpt_dir`: the directory where checkpoints will be saved.
* `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory.
In the following code snippet, we use the loss value as the checkpoint identifier.
```python
# load checkpoint
_, client_sd = model_engine.load_checkpoint(args.load_dir, args.ckpt_id)
step = client_sd['step']

# advance data loader to ckpt step
dataloader_to_step(data_loader, step + 1)

for step, batch in enumerate(data_loader):
    # forward() method
    loss = model_engine(batch)
    # runs backpropagation
    model_engine.backward(loss)
    # weight update
    model_engine.step()
    # save checkpoint
    if step % args.save_interval == 0:
        client_sd['step'] = step
        ckpt_id = loss.item()
        model_engine.save_checkpoint(args.save_dir, ckpt_id, client_sd=client_sd)
```
DeepSpeed can automatically save and restore the model, optimizer, and the
learning rate scheduler states while hiding away these details from the user.
However, the user may want to save additional data that are unique to a given
model training. To support these items, `save_checkpoint`
accepts a client state dictionary `client_sd` for saving. These items can be
retrieved from `load_checkpoint` as a return argument. In the example above,
the `step` value is stored as part of the `client_sd`.
## DeepSpeed Configuration
DeepSpeed features can be enabled, disabled, or configured using a config JSON
file that should be specified as `args.deepspeed_config`. A sample config file
is shown below. For a full set of features see [core API
doc](https://microsoft.github.io/DeepSpeed/docs/htmlfiles/api/full/index.html).
```json
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"fp16": {
"enabled": true
},
"zero_optimization": true
}
```
## Multi-Node Environment Variables
When training across multiple nodes we have found it useful to support
propagating user-defined environment variables. By default DeepSpeed will
propagate all NCCL and PYTHON related environment variables that are set. If
you would like to propagate additional variables you can specify them in a
dot-file named `.deepspeed_env` that contains a new-line separated list of
`VAR=VAL` entries. The DeepSpeed launcher will look in the local path you are
executing from and also in your home directory (`~/`).
As a concrete example, some clusters require special NCCL variables to be set
prior to training. The user can simply add these variables to a
`.deepspeed_env` file in their home directory that looks like this:
```
NCCL_IB_DISABLE=1
NCCL_SOCKET_IFNAME=eth0
```
DeepSpeed will then make sure that these environment variables are set when
launching each process on every node across their training job.
# Launching DeepSpeed Training
DeepSpeed installs the entry point `deepspeed` to launch distributed training.
We illustrate an example usage of DeepSpeed with the following assumptions:
1. You have already integrated DeepSpeed into your model
2. `client_entry.py` is the entry script for your model
3. `client args` are the `argparse` command line arguments
4. `ds_config.json` is the configuration file for DeepSpeed
## Resource Configuration (multi-node)
DeepSpeed configures multi-node compute resources with hostfiles that are compatible with
[OpenMPI](https://www.open-mpi.org/) and [Horovod](https://github.com/horovod/horovod).
A hostfile is a list of *hostnames* (or SSH aliases), which are machines accessible via passwordless
SSH, and *slot counts*, which specify the number of GPUs available on the system. For
example,
```
worker-1 slots=4
worker-2 slots=4
```
specifies that two machines named *worker-1* and *worker-2* each have four GPUs to use
for training.
Hostfiles are specified with the `--hostfile` command line option. If no hostfile is
specified, DeepSpeed searches for `/job/hostfile`. If no hostfile is specified or found,
DeepSpeed queries the number of GPUs on the local machine to discover the number of local
slots available.
The following command launches a PyTorch training job across all available nodes and GPUs
specified in `myhostfile`:
```bash
deepspeed <client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json --hostfile=myhostfile
```
Alternatively, DeepSpeed allows you to restrict distributed training of your model to a
subset of the available nodes and GPUs. This feature is enabled through two command line
arguments: `--num_nodes` and `--num_gpus`. For example, distributed training can be
restricted to use only two nodes with the following command:
```bash
deepspeed --num_nodes=2 \
<client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
```
You can instead include or exclude specific resources using the `--include` and
`--exclude` flags. For example, to use all available resources **except** GPU 0 on node
*worker-2* and GPUs 0 and 1 on *worker-3*:
```bash
deepspeed --exclude="worker-2:0@worker-3:0,1" \
<client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
```
Similarly, you can use **only** GPUs 0 and 1 on *worker-2*:
```bash
deepspeed --include="worker-2:0,1" \
<client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
```
### MPI Compatibility
As described above, DeepSpeed provides its own parallel launcher to help launch
multi-node/multi-gpu training jobs. If you prefer to launch your training job
using MPI (e.g., mpirun), we provide support for this. It should be noted that
DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI
backend. To launch your training job with mpirun + DeepSpeed you simply pass us
an additional flag `--deepspeed_mpi`. DeepSpeed will then use
[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g.,
rank, world size) and properly initialize torch distributed for training. In this
case you will explicitly invoke `python` to launch your model script instead of using
the `deepspeed` launcher; here is an example:
```bash
mpirun <mpi-args> python \
<client_entry.py> <client args> \
--deepspeed_mpi --deepspeed --deepspeed_config ds_config.json
```
If you want to use this feature of DeepSpeed, please ensure that mpi4py is
installed via `pip install mpi4py`.
## Resource Configuration (single-node)
In the case that we are only running on a single node (with one or more GPUs)
DeepSpeed *does not* require a hostfile as described above. If a hostfile is
not detected or passed in then DeepSpeed will query the number of GPUs on the
local machine to discover the number of slots available. The `--include` and
`--exclude` arguments work as normal, but the user should specify 'localhost'
as the hostname.
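For example, to restrict training to GPUs 0 and 1 of the local machine, a sketch following the same flag syntax as the multi-node examples above:
```bash
deepspeed --include="localhost:0,1" \
	<client_entry.py> <client args> \
	--deepspeed --deepspeed_config ds_config.json
```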
# DeepSpeed with Azure
This tutorial will help you get started running DeepSpeed on [Azure virtual
machines](https://azure.microsoft.com/en-us/services/virtual-machines/).
Looking forward, we will be integrating these techniques and additional enhancements
into the [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) platform to
benefit all your large model training jobs.
If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/).
To help with launching Azure instances we suggest using the [Azure
CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created
several helper scripts to get you quickly started using DeepSpeed with Azure.
* Install Azure CLI on your local box: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
* Alternatively you can use the Azure in-browser shell: https://shell.azure.com/
## Create an SSH key
Generate an SSH key that will be used across this tutorial to SSH into your VMs and
between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts
assume your key is located inside the same directory as the Azure scripts.
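For example, assuming you want a passphrase-less RSA key named `id_rsa` created in the current (Azure scripts) directory, matching the `ssh_private_key` value used in the config shown in the next section:
```bash
# creates id_rsa and id_rsa.pub in the current directory with no passphrase
ssh-keygen -t rsa -f ./id_rsa -N ""
```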
## Azure Config JSON
Our helper scripts depend on the following configuration JSON for deployment
and setup. We have provided a simple example JSON in `azure_config.json` that
sets up a basic environment with two VMs. This config uses the NV6_Promo
instance type which has one NVIDIA Tesla M60 GPU per VM. You can read more
details about the VM on the [Linux Virtual Machines
Pricing](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/)
page.
See the example below:
```json
{
"num_vms": 2,
"location": "southcentralus",
"azure_sku": "Standard_NV6_Promo",
"ssh_private_key": "id_rsa",
"docker_ssh_port": 2222
}
```
## Dependencies
The scripts in this tutorial require [jq](https://stedolan.github.io/jq/) to help with
parsing JSON from the command line. Also, it is recommended to install
[pdsh](https://linux.die.net/man/1/pdsh) to help launch ssh connections in parallel.
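On Debian/Ubuntu systems (an assumption; use your distribution's package manager otherwise) both can be installed with:
```bash
sudo apt-get update && sudo apt-get install -y jq pdsh
```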
## Create Azure VMs
We first need to allocate the VMs. We provide a script
```bash
./create_vms.sh
```
to create VMs with the Azure SKU in the region specified in `azure_config.json`. Feel
free to customize your JSON to your desired region/SKU. This step will take a few minutes
to complete while it sets up all of your VMs on Azure.
## Setup VM environment to use DeepSpeed
Next, we need to configure the VM environment for DeepSpeed. We provide a script
```bash
./setup_vms.sh
```
to generate a [hostfile](../README.md#resource-configuration) and SSH
configuration on all of the VMs. This configuration will be used by the DeepSpeed
Docker containers in the next step.
## Start the DeepSpeed docker container
We now set up the DeepSpeed Docker containers on the VMs. We provide a script
```bash
./setup_docker.sh
```
to pull the DeepSpeed image onto all VMs and start a container instance in the
background. This will take several minutes since it needs to pull the entire Docker
image.
## Access VMs
The tool [azure_ssh.sh](azure_ssh.sh) will let you SSH into any of the VMs with this
syntax:
```bash
./azure_ssh.sh <node-id> [command]
```
where the `node-id` is a number between `0` and `num_vms-1`. This script will find the
public IP address of your VM and use the SSH key provided in the Azure configuration
JSON.
## Access DeepSpeed container
Everything should be up and running at this point. Let's access the running DeepSpeed
container on the first VM and make sure we can talk to the other containers in our deployment.
* SSH into the first VM via: `./azure_ssh.sh 0`
* Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure`
* Attach to the running Docker container via: `./attach.sh`
* You should now be able to `ssh` into any other docker container; the containers can be
accessed via their SSH alias of `worker-N`, where `N` is the VM number between `0`
and `num_vms-1`. In this example we should be able to successfully run `ssh worker-1
hostname`, which will return the hostname of worker-1.
## Parallel SSH across containers
DeepSpeed comes installed with a helper script `ds_ssh` which is a wrapper around
the [pdsh](https://linux.die.net/man/1/pdsh) command that lets you issue commands
to groups of hosts (via SSH) in parallel. This wrapper simply connects with the
hostfile that defines all the containers in your deployment. For example if you run
`ds_ssh hostname` you should see a list of all the hostnames in your deployment.
## Run CIFAR-10 example model
We will now run the DeepSpeed CIFAR-10 model example to test the VM setup. From inside
the first DeepSpeed container:
1) Install the python dependencies necessary to run the CIFAR-10 example model. You can
do this across your cluster via:
```bash
ds_ssh pip install -r ~/workdir/DeepSpeed/DeepSpeedExamples/cifar/requirements.txt
```
2) Now change directories to the CIFAR example:
```bash
cd ~/workdir/DeepSpeed/DeepSpeedExamples/cifar
```
3) Finally, launch training across all VMs:
```bash
deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json
```
## Megatron-LM GPT2
DeepSpeed includes an example model using Megatron-LM's GPT2. Please refer to the full
[Megatron tutorial](../docs/tutorials/MegatronGPT2Tutorial.md) for more details.
* In order to fully train GPT2 with DeepSpeed and ZeRO we recommend using 8 instances of
Azure's Standard_ND40rs_v2 SKU for a total of 64 NVIDIA V100 GPUs. With this setup and
a batch size of 1536 you should be able to complete 100k training steps (153.6 million
samples) in less than 2 weeks of training.
---
layout: home
---
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
#!/bin/bash
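# Regenerate the Sphinx API stubs for the deepspeed package, then build the HTML docs.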
sphinx-apidoc -f -o source ../../deepspeed
make html
sphinx
recommonmark
sphinx-rtd-theme