Unverified commit 5042dc00, authored by Shaden Smith and committed by GitHub

drafting Jekyll webpage (#143)

parent d6bc44bf
...@@ -9,3 +9,10 @@ build/
dist/
fused_lamb_*.so
deepspeed.egg-info/
# Website
docs/_site/
docs/code-docs/build
.sass-cache/
.jekyll-cache/
.jekyll-metadata
---
permalink: /404.html
layout: default
---
<style type="text/css" media="screen">
.container {
margin: 10px auto;
max-width: 600px;
text-align: center;
}
h1 {
margin: 30px 0;
font-size: 4em;
line-height: 1;
letter-spacing: -1px;
}
</style>
<div class="container">
<h1>404</h1>
<p><strong>Page not found :(</strong></p>
<p>The requested page could not be found.</p>
</div>
www.deepspeed.ai
source "https://rubygems.org"
gem 'github-pages', group: :jekyll_plugins
# If you have any plugins, put them here!
group :jekyll_plugins do
gem "jekyll-feed"
gem "jekyll-paginate"
gem "jekyll-remote-theme"
gem "jekyll-include-cache"
gem "minimal-mistakes-jekyll"
end
# Windows and JRuby do not include zoneinfo files, so bundle the tzinfo-data gem
# and associated library.
install_if -> { RUBY_PLATFORM =~ %r!mingw|mswin|java! } do
gem "tzinfo", "~> 1.2"
gem "tzinfo-data"
end
# Performance-booster for watching directories on Windows
gem "wdm", "~> 0.1.1", :install_if => Gem.win_platform?
GEM
remote: https://rubygems.org/
specs:
activesupport (6.0.2.1)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (>= 0.7, < 2)
minitest (~> 5.1)
tzinfo (~> 1.1)
zeitwerk (~> 2.2)
addressable (2.7.0)
public_suffix (>= 2.0.2, < 5.0)
coffee-script (2.4.1)
coffee-script-source
execjs
coffee-script-source (1.11.1)
colorator (1.1.0)
commonmarker (0.17.13)
ruby-enum (~> 0.5)
concurrent-ruby (1.1.6)
dnsruby (1.61.3)
addressable (~> 2.5)
em-websocket (0.5.1)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0.6.0)
ethon (0.12.0)
ffi (>= 1.3.0)
eventmachine (1.2.7)
execjs (2.7.0)
faraday (1.0.0)
multipart-post (>= 1.2, < 3)
ffi (1.12.2)
forwardable-extended (2.6.0)
gemoji (3.0.1)
github-pages (204)
github-pages-health-check (= 1.16.1)
jekyll (= 3.8.5)
jekyll-avatar (= 0.7.0)
jekyll-coffeescript (= 1.1.1)
jekyll-commonmark-ghpages (= 0.1.6)
jekyll-default-layout (= 0.1.4)
jekyll-feed (= 0.13.0)
jekyll-gist (= 1.5.0)
jekyll-github-metadata (= 2.13.0)
jekyll-mentions (= 1.5.1)
jekyll-optional-front-matter (= 0.3.2)
jekyll-paginate (= 1.1.0)
jekyll-readme-index (= 0.3.0)
jekyll-redirect-from (= 0.15.0)
jekyll-relative-links (= 0.6.1)
jekyll-remote-theme (= 0.4.1)
jekyll-sass-converter (= 1.5.2)
jekyll-seo-tag (= 2.6.1)
jekyll-sitemap (= 1.4.0)
jekyll-swiss (= 1.0.0)
jekyll-theme-architect (= 0.1.1)
jekyll-theme-cayman (= 0.1.1)
jekyll-theme-dinky (= 0.1.1)
jekyll-theme-hacker (= 0.1.1)
jekyll-theme-leap-day (= 0.1.1)
jekyll-theme-merlot (= 0.1.1)
jekyll-theme-midnight (= 0.1.1)
jekyll-theme-minimal (= 0.1.1)
jekyll-theme-modernist (= 0.1.1)
jekyll-theme-primer (= 0.5.4)
jekyll-theme-slate (= 0.1.1)
jekyll-theme-tactile (= 0.1.1)
jekyll-theme-time-machine (= 0.1.1)
jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.11.1)
kramdown (= 1.17.0)
liquid (= 4.0.3)
mercenary (~> 0.3)
minima (= 2.5.1)
nokogiri (>= 1.10.4, < 2.0)
rouge (= 3.13.0)
terminal-table (~> 1.4)
github-pages-health-check (1.16.1)
addressable (~> 2.3)
dnsruby (~> 1.60)
octokit (~> 4.0)
public_suffix (~> 3.0)
typhoeus (~> 1.3)
html-pipeline (2.12.3)
activesupport (>= 2)
nokogiri (>= 1.4)
http_parser.rb (0.6.0)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
jekyll (3.8.5)
addressable (~> 2.4)
colorator (~> 1.0)
em-websocket (~> 0.5)
i18n (~> 0.7)
jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0)
kramdown (~> 1.14)
liquid (~> 4.0)
mercenary (~> 0.3.3)
pathutil (~> 0.9)
rouge (>= 1.7, < 4)
safe_yaml (~> 1.0)
jekyll-avatar (0.7.0)
jekyll (>= 3.0, < 5.0)
jekyll-coffeescript (1.1.1)
coffee-script (~> 2.2)
coffee-script-source (~> 1.11.1)
jekyll-commonmark (1.3.1)
commonmarker (~> 0.14)
jekyll (>= 3.7, < 5.0)
jekyll-commonmark-ghpages (0.1.6)
commonmarker (~> 0.17.6)
jekyll-commonmark (~> 1.2)
rouge (>= 2.0, < 4.0)
jekyll-default-layout (0.1.4)
jekyll (~> 3.0)
jekyll-feed (0.13.0)
jekyll (>= 3.7, < 5.0)
jekyll-gist (1.5.0)
octokit (~> 4.2)
jekyll-github-metadata (2.13.0)
jekyll (>= 3.4, < 5.0)
octokit (~> 4.0, != 4.4.0)
jekyll-include-cache (0.2.0)
jekyll (>= 3.7, < 5.0)
jekyll-mentions (1.5.1)
html-pipeline (~> 2.3)
jekyll (>= 3.7, < 5.0)
jekyll-optional-front-matter (0.3.2)
jekyll (>= 3.0, < 5.0)
jekyll-paginate (1.1.0)
jekyll-readme-index (0.3.0)
jekyll (>= 3.0, < 5.0)
jekyll-redirect-from (0.15.0)
jekyll (>= 3.3, < 5.0)
jekyll-relative-links (0.6.1)
jekyll (>= 3.3, < 5.0)
jekyll-remote-theme (0.4.1)
addressable (~> 2.0)
jekyll (>= 3.5, < 5.0)
rubyzip (>= 1.3.0)
jekyll-sass-converter (1.5.2)
sass (~> 3.4)
jekyll-seo-tag (2.6.1)
jekyll (>= 3.3, < 5.0)
jekyll-sitemap (1.4.0)
jekyll (>= 3.7, < 5.0)
jekyll-swiss (1.0.0)
jekyll-theme-architect (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-cayman (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-dinky (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-hacker (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-leap-day (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-merlot (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-midnight (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-minimal (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-modernist (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-primer (0.5.4)
jekyll (> 3.5, < 5.0)
jekyll-github-metadata (~> 2.9)
jekyll-seo-tag (~> 2.0)
jekyll-theme-slate (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-tactile (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-theme-time-machine (0.1.1)
jekyll (~> 3.5)
jekyll-seo-tag (~> 2.0)
jekyll-titles-from-headings (0.5.3)
jekyll (>= 3.3, < 5.0)
jekyll-watch (2.2.1)
listen (~> 3.0)
jemoji (0.11.1)
gemoji (~> 3.0)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
kramdown (1.17.0)
liquid (4.0.3)
listen (3.2.1)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
mercenary (0.3.6)
mini_portile2 (2.4.0)
minima (2.5.1)
jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1)
minimal-mistakes-jekyll (4.19.1)
jekyll (>= 3.7, < 5.0)
jekyll-feed (~> 0.1)
jekyll-gist (~> 1.5)
jekyll-include-cache (~> 0.1)
jekyll-paginate (~> 1.1)
jekyll-sitemap (~> 1.3)
minitest (5.14.0)
multipart-post (2.1.1)
nokogiri (1.10.9)
mini_portile2 (~> 2.4.0)
octokit (4.17.0)
faraday (>= 0.9)
sawyer (~> 0.8.0, >= 0.5.3)
pathutil (0.16.2)
forwardable-extended (~> 2.6)
public_suffix (3.1.1)
rb-fsevent (0.10.3)
rb-inotify (0.10.1)
ffi (~> 1.0)
rouge (3.13.0)
ruby-enum (0.7.2)
i18n
rubyzip (2.3.0)
safe_yaml (1.0.5)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
sawyer (0.8.2)
addressable (>= 2.3.5)
faraday (> 0.8, < 2.0)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
thread_safe (0.3.6)
typhoeus (1.3.1)
ethon (>= 0.9.0)
tzinfo (1.2.6)
thread_safe (~> 0.1)
tzinfo-data (1.2019.3)
tzinfo (>= 1.0.0)
unicode-display_width (1.7.0)
wdm (0.1.1)
zeitwerk (2.3.0)
PLATFORMS
ruby
DEPENDENCIES
github-pages
jekyll-feed
jekyll-include-cache
jekyll-paginate
jekyll-remote-theme
minimal-mistakes-jekyll
tzinfo (~> 1.2)
tzinfo-data
wdm (~> 0.1.1)
BUNDLED WITH
2.1.4
title: DeepSpeed
email: deepspeed@microsoft.com
description: >-
DeepSpeed is a deep learning optimization library that makes distributed
training easy, efficient, and effective.
locale : "en-US"
repository: microsoft/DeepSpeed
baseurl: "/" # the subpath of your site, e.g. /blog
url: "https://www.deepspeed.ai" # the base hostname & protocol for your site, e.g. http://example.com
# Build settings
remote_theme: "mmistakes/minimal-mistakes@4.19.0"
minimal_mistakes_skin : "air"
plugins:
- jekyll-feed
- jekyll-include-cache
- jekyll-paginate
#paginate: 10
#paginate_path: /blog/page:num
include: ["_pages"]
exclude: ["code-docs"]
collections:
  tutorials:
    output: true
    permalink: /:collection/:path/
defaults:
  - scope:
      path: ""
      type: posts
    values:
      layout: single
      author_profile: false
      read_time: true
      comments: false
      share: true
      related: false
  # _tutorials
  - scope:
      path: ""
      type: tutorials
    values:
      layout: single
      toc: true
      toc_label: "Contents"
      sidebar:
        nav: "lnav"
timezone: America/Los_Angeles
breadcrumbs: true
main:
  - title: "Getting Started"
    url: /getting-started/
  - title: "Blog"
    url: /blog/
  - title: "Tutorials"
    url: /tutorials/
  - title: "Documentation"
    url: https://ghpages-test.readthedocs.io/
  - title: "GitHub"
    url: https://github.com/microsoft/DeepSpeed

lnav:
  - title: "This is a floating nav bar."
  - title: "Getting Started"
    url: /getting-started/
    children:
      - title: "Installation"
        url: /getting-started/#installation
      - title: "Configuration"
        url: /getting-started/#deepspeed-configuration
---
title: "Tutorials"
layout: collection
collection: tutorials
permalink: /tutorials/
---
---
layout: single
title: "ZeRO & DeepSpeed: New system optimizations enable training models with over 100 billion parameters"
date: 2020-02-13
link: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
excerpt: "Developed by Microsoft AI & Research."
categories: news
---
---
layout: single
title: "Turing-NLG: A 17-billion-parameter language model by Microsoft"
date: 2020-02-13
link: https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/
excerpt: "DeepSpeed was used to train the world's largest language model."
categories: news
---
---
title: "ZeRO stage 1 with reduced communication"
date: 2020-03-13
excerpt: "Partition-aware ZeRO with up to 2x reduction in communication time!"
---
# ZeRO stage 1 with reduced communication
* Partition-aware approach replaces the initial implementation, which used a global collective (all-reduce)
* Total communication volume reduced from 1.5x to 1x of data parallelism
* Up to 2x reduction in communication time compared to all-reduce
# Further updates coming soon!
---
title: "ZeRO stage 2"
date: 2020-03-13
excerpt: "Reduce memory footprint to enable training 10B models without model parallelism!"
---
# ZeRO Stage 2
* Reduce memory footprint of gradients
* Train larger models: e.g., 10B parameters on 32 GPUs without model parallelism
* Train larger batch sizes
# Further updates coming soon!
---
title: "CIFAR-10 Tutorial"
excerpt: "Train your first model with DeepSpeed!"
---
If you haven't already, we advise you to first read through the [Getting
Started](../../README.md#getting-started) guide before stepping through this
...@@ -10,22 +13,22 @@ First we will go over how to run original CIFAR-10. Then we will proceed step-by
## Running Original CIFAR-10
The original model code is from the [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py). We've copied this repo under [DeepSpeedExamples/cifar/](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) and made it available as a submodule. To download, execute:
```bash
git submodule update --init --recursive
```
To install requirements for CIFAR-10:
```bash
cd DeepSpeedExamples/cifar
pip install -r requirements.txt
```
Run `python cifar10_tutorial.py`; it downloads the training data set on the first run.
```
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
170500096it [00:02, 61124868.24it/s]
Extracting ./data/cifar-10-python.tar.gz to ./data
```
...@@ -63,10 +66,10 @@ cuda:0
## Enabling DeepSpeed
### Argument Parsing
The first step to apply DeepSpeed is adding DeepSpeed arguments to the CIFAR-10 model, using the `deepspeed.add_config_arguments()` function as below.
...@@ -103,7 +106,7 @@ The first step to apply DeepSpeed is adding DeepSpeed arguments to CIFAR-10 mode
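The parser changes themselves are elided from this diff; as a rough, illustrative sketch (the argument names below are assumptions, not the example's exact flags), the pattern looks like:

```python
import argparse
import deepspeed

def add_argument():
    parser = argparse.ArgumentParser(description='CIFAR-10 with DeepSpeed')
    # example-specific arguments (names are illustrative)
    parser.add_argument('-e', '--epochs', default=30, type=int,
                        help='number of total epochs to run')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from the distributed launcher')
    # adds DeepSpeed's own arguments, e.g. --deepspeed and --deepspeed_config
    parser = deepspeed.add_config_arguments(parser)
    return parser.parse_args()

args = add_argument()
```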
### Initialization
We use `deepspeed.initialize` to create `model_engine`, `optimizer` and `trainloader`. Below is its definition.
...@@ -144,27 +147,28 @@ The original device and optimizer can be removed after initializing DeepSpeed.
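The initialization call is elided from this diff; a minimal sketch of the typical pattern (the names `net`, `trainset`, and `args` are assumed from the surrounding example, not confirmed by this diff):

```python
import deepspeed

# net: the original torch.nn.Module from the CIFAR-10 example
# trainset: the CIFAR-10 training dataset; args: parsed command-line arguments
parameters = filter(lambda p: p.requires_grad, net.parameters())

model_engine, optimizer, trainloader, _ = deepspeed.initialize(
    args=args,                   # contains --deepspeed_config
    model=net,                   # the original model
    model_parameters=parameters,
    training_data=trainset)      # DeepSpeed builds a distributed data loader from this
```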
### Training API
The `model` returned by `deepspeed.initialize` is the _DeepSpeed Model Engine_ that we will use to train the model using the forward, backward and step API.
```python
for i, data in enumerate(trainloader):
    # get the inputs; data is a list of [inputs, labels]
    inputs = data[0].to(model_engine.device)
    labels = data[1].to(model_engine.device)
    outputs = model_engine(inputs)
    loss = criterion(outputs, labels)
    model_engine.backward(loss)
    model_engine.step()
```
Zeroing the gradients is handled automatically by DeepSpeed after the weights have been updated using a mini-batch.
### Configuration
The next step to use DeepSpeed is to create a configuration JSON file (ds_config.json). This file provides DeepSpeed specific parameters defined by the user, e.g., batch size, optimizer, scheduler and other parameters.
...@@ -198,20 +202,17 @@ The next step to use DeepSpeed is to create a configuration JSON file (ds_config
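The full `ds_config.json` for this tutorial is elided from the diff; an illustrative sketch using standard DeepSpeed configuration keys (the specific values are assumptions, not the tutorial's actual settings):

```json
{
  "train_batch_size": 16,
  "steps_per_print": 2000,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.001
    }
  },
  "fp16": {
    "enabled": false
  }
}
```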
### Run CIFAR-10 Model with DeepSpeed Enabled
To start training the CIFAR-10 model with DeepSpeed applied, execute the following command; it will use all detected GPUs by default.
```bash
deepspeed cifar10_deepspeed.py --deepspeed_config ds_config.json
```
DeepSpeed usually prints more training details for the user to monitor, including training settings, performance statistics and loss trends.
```
deepspeed.pt cifar10_deepspeed.py --deepspeed_config ds_config.json
Warning: Permanently added '[192.168.0.22]:42227' (ECDSA) to the list of known hosts.
cmd=['pdsh', '-w', 'worker-0', 'export NCCL_VERSION=2.4.2; ', 'cd /data/users/deepscale/test/ds_v2/examples/cifar;', '/usr/bin/python', '-u', '-m', 'deepspeed.pt.deepspeed_launch', '--world_info=eyJ3b3JrZXItMCI6IFswXX0=', '--node_rank=%n', '--master_addr=192.168.0.22', '--master_port=29500', 'cifar10_deepspeed.py', '--deepspeed', '--deepspeed_config', 'ds_config.json']
worker-0: Warning: Permanently added '[192.168.0.22]:42227' (ECDSA) to the list of known hosts.
...
```
---
title: "Getting Started"
permalink: /getting-started/
excerpt: "First steps with DeepSpeed"
---
## Installation
* Please see our [Azure tutorial](docs/azure.md) to get started with DeepSpeed on Azure!
* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
* If you want to install DeepSpeed manually, we provide an install script [install.sh](install.sh) to help install on a local machine or across an entire cluster.
## Writing DeepSpeed Models
DeepSpeed model training is accomplished using the DeepSpeed engine. The engine
can wrap an arbitrary model of type `torch.nn.Module` and has a minimal set of APIs
for training and checkpointing the model. Please see the tutorials for detailed
examples.
To initialize the DeepSpeed engine:
```python
model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args,
                                                      model=model,
                                                      model_parameters=params)
```
`deepspeed.initialize` ensures that all of the necessary setup required for
distributed data parallel or mixed precision training is done
appropriately under the hood. In addition to wrapping the model, DeepSpeed can
construct and manage the training optimizer, data loader, and the learning rate
scheduler based on the parameters passed to `deepspeed.initialize` and the
DeepSpeed [configuration file](#deepspeed-configuration).
### Training
Once the DeepSpeed engine has been initialized, it can be used to train the
model using three simple APIs: forward propagation (calling the engine directly),
backward propagation (`backward`), and weight updates (`step`).
```python
for step, batch in enumerate(data_loader):
    # forward() method
    loss = model_engine(batch)
    # runs backpropagation
    model_engine.backward(loss)
    # weight update
    model_engine.step()
```
Under the hood, DeepSpeed automatically performs the necessary operations
required for distributed data parallel training, in mixed precision, with a
pre-defined learning rate schedule:
* **Gradient Averaging**: in distributed data parallel training, `backward`
ensures that gradients are averaged across data parallel processes after
training on a `train_batch_size`.
* **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed
engine automatically handles scaling the loss to avoid precision loss in the
gradients.
* **Learning Rate Schedule**: if using DeepSpeed's learning rate
schedule, then DeepSpeed automatically handles any updates to the learning
rate when `step` is executed (a configuration sketch follows this list).
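As a concrete illustration of the last point, the schedule is selected in the DeepSpeed config file rather than in code; a minimal sketch assuming DeepSpeed's `WarmupLR` schedule (the values here are illustrative):

```json
{
  "train_batch_size": 8,
  "optimizer": {
    "type": "Adam",
    "params": { "lr": 0.00015 }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 0.00015,
      "warmup_num_steps": 1000
    }
  }
}
```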
### Model Checkpointing
Saving and loading the training state is handled via the `save_checkpoint` and
`load_checkpoint` APIs in DeepSpeed, which take two arguments to uniquely
identify a checkpoint:
* `ckpt_dir`: the directory where checkpoints will be saved.
* `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory.
In the following code snippet, we use the loss value as the checkpoint identifier.
```python
# load checkpoint
_, client_sd = model_engine.load_checkpoint(args.load_dir, args.ckpt_id)
step = client_sd['step']

# advance data loader to ckpt step
dataloader_to_step(data_loader, step + 1)

for step, batch in enumerate(data_loader):
    # forward() method
    loss = model_engine(batch)
    # runs backpropagation
    model_engine.backward(loss)
    # weight update
    model_engine.step()
    # save checkpoint
    if step % args.save_interval == 0:
        client_sd['step'] = step
        ckpt_id = loss.item()
        model_engine.save_checkpoint(args.save_dir, ckpt_id, client_sd=client_sd)
```
DeepSpeed can automatically save and restore the model, optimizer, and the
learning rate scheduler states while hiding away these details from the user.
However, the user may want to save additional data that are unique to a given
model training. To support these items, `save_checkpoint`
accepts a client state dictionary `client_sd` for saving. These items can be
retrieved from `load_checkpoint` as a return argument. In the example above,
the `step` value is stored as part of the `client_sd`.
## DeepSpeed Configuration
DeepSpeed features can be enabled, disabled, or configured using a config JSON
file that should be specified as `args.deepspeed_config`. A sample config file
is shown below. For a full set of features see [core API
doc](https://microsoft.github.io/DeepSpeed/docs/htmlfiles/api/full/index.html).
```json
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"fp16": {
"enabled": true
},
"zero_optimization": true
}
```
## Multi-Node Environment Variables
When training across multiple nodes we have found it useful to support
propagating user-defined environment variables. By default DeepSpeed will
propagate all NCCL and PYTHON related environment variables that are set. If
you would like to propagate additional variables you can specify them in a
dot-file named `.deepspeed_env` that contains a new-line separated list of
`VAR=VAL` entries. The DeepSpeed launcher will look in the local path you are
executing from and also in your home directory (`~/`).
As a concrete example, some clusters require special NCCL variables to be set
prior to training. The user can simply add these variables to a
`.deepspeed_env` file in their home directory that looks like this:
```
NCCL_IB_DISABLE=1
NCCL_SOCKET_IFNAME=eth0
```
DeepSpeed will then make sure that these environment variables are set when
launching each process on every node across their training job.
# Launching DeepSpeed Training
DeepSpeed installs the entry point `deepspeed` to launch distributed training.
We illustrate an example usage of DeepSpeed with the following assumptions:
1. You have already integrated DeepSpeed into your model
2. `client_entry.py` is the entry script for your model
3. `client args` are the `argparse` command line arguments
4. `ds_config.json` is the configuration file for DeepSpeed
## Resource Configuration (multi-node)
DeepSpeed configures multi-node compute resources with hostfiles that are compatible with
[OpenMPI](https://www.open-mpi.org/) and [Horovod](https://github.com/horovod/horovod).
A hostfile is a list of *hostnames* (or SSH aliases), which are machines accessible via passwordless
SSH, and *slot counts*, which specify the number of GPUs available on the system. For
example,
```
worker-1 slots=4
worker-2 slots=4
```
specifies that two machines named *worker-1* and *worker-2* each have four GPUs to use
for training.
Hostfiles are specified with the `--hostfile` command line option. If no hostfile is
specified, DeepSpeed searches for `/job/hostfile`. If no hostfile is specified or found,
DeepSpeed queries the number of GPUs on the local machine to discover the number of local
slots available.
The following command launches a PyTorch training job across all available nodes and GPUs
specified in `myhostfile`:
```bash
deepspeed <client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json --hostfile=myhostfile
```
Alternatively, DeepSpeed allows you to restrict distributed training of your model to a
subset of the available nodes and GPUs. This feature is enabled through two command line
arguments: `--num_nodes` and `--num_gpus`. For example, distributed training can be
restricted to use only two nodes with the following command:
```bash
deepspeed --num_nodes=2 \
<client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
```
You can instead include or exclude specific resources using the `--include` and
`--exclude` flags. For example, to use all available resources **except** GPU 0 on node
*worker-2* and GPUs 0 and 1 on *worker-3*:
```bash
deepspeed --exclude="worker-2:0@worker-3:0,1" \
<client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
```
Similarly, you can use **only** GPUs 0 and 1 on *worker-2*:
```bash
deepspeed --include="worker-2:0,1" \
<client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
```
### MPI Compatibility
As described above, DeepSpeed provides its own parallel launcher to help launch
multi-node/multi-gpu training jobs. If you prefer to launch your training job
using MPI (e.g., mpirun), we provide support for this. It should be noted that
DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI
backend. To launch your training job with mpirun + DeepSpeed you simply pass us
an additional flag `--deepspeed_mpi`. DeepSpeed will then use
[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g.,
rank, world size) and properly initialize torch distributed for training. In this
case you will explicitly invoke `python` to launch your model script instead of using
the `deepspeed` launcher; here is an example:
```bash
mpirun <mpi-args> python \
<client_entry.py> <client args> \
--deepspeed_mpi --deepspeed --deepspeed_config ds_config.json
```
If you want to use this feature of DeepSpeed, please ensure that mpi4py is
installed via `pip install mpi4py`.
## Resource Configuration (single-node)
In the case that we are only running on a single node (with one or more GPUs)
DeepSpeed *does not* require a hostfile as described above. If a hostfile is
not detected or passed in then DeepSpeed will query the number of GPUs on the
local machine to discover the number of slots available. The `--include` and
`--exclude` arguments work as normal, but the user should specify 'localhost'
as the hostname.
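For example, to restrict training to GPUs 0 and 1 of the local machine, a sketch following the same flag syntax as the multi-node examples above:
```bash
deepspeed --include="localhost:0,1" \
	<client_entry.py> <client args> \
	--deepspeed --deepspeed_config ds_config.json
```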
# DeepSpeed with Azure
This tutorial will help you get started running DeepSpeed on [Azure virtual
machines](https://azure.microsoft.com/en-us/services/virtual-machines/).
Looking forward, we will be integrating these techniques and additional enhancements
into the [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/) platform to
benefit all your large model training jobs.
If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/).
To help with launching Azure instances we suggest using the [Azure
CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created
several helper scripts to get you quickly started using DeepSpeed with Azure.
* Install Azure CLI on your local box: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
* Alternatively you can use the Azure in-browser shell: https://shell.azure.com/
## Create an SSH key
Generate an SSH key that will be used across this tutorial to SSH into your VMs and
between Docker containers. `ssh-keygen` is the recommended way of doing this. Our scripts
assume your key is located inside the same directory as the Azure scripts.
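For example, assuming you want a passphrase-less RSA key named `id_rsa` created in the current (Azure scripts) directory, matching the `ssh_private_key` value used in the config shown in the next section:
```bash
# creates id_rsa and id_rsa.pub in the current directory with no passphrase
ssh-keygen -t rsa -f ./id_rsa -N ""
```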
## Azure Config JSON
Our helper scripts depend on the following configuration JSON for deployment
and setup. We have provided a simple example JSON in `azure_config.json` that
sets up a basic environment with two VMs. This config uses the NV6_Promo
instance type which has one NVIDIA Tesla M60 GPU per VM. You can read more
details about the VM on the [Linux Virtual Machines
Pricing](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/)
page.
See the example below:
```json
{
"num_vms": 2,
"location": "southcentralus",
"azure_sku": "Standard_NV6_Promo",
"ssh_private_key": "id_rsa",
"docker_ssh_port": 2222
}
```
## Dependencies
The scripts in this tutorial require [jq](https://stedolan.github.io/jq/) to help with
parsing JSON from the command line. Also, it is recommended to install
[pdsh](https://linux.die.net/man/1/pdsh) to help launch ssh connections in parallel.
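On Debian/Ubuntu systems (an assumption; use your distribution's package manager otherwise) both can be installed with:
```bash
sudo apt-get update && sudo apt-get install -y jq pdsh
```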
## Create Azure VMs
We first need to allocate the VMs. We provide a script
```bash
./create_vms.sh
```
to create VMs with the Azure SKU in the region specified in `azure_config.json`. Feel
free to customize your JSON to your desired region/SKU. This step will take a few minutes
to complete while it sets up all of your VMs on Azure.
## Setup VM environment to use DeepSpeed
Next, we need to configure the VM environment for DeepSpeed. We provide a script
```bash
./setup_vms.sh
```
to generate a [hostfile](../README.md#resource-configuration) and SSH
configuration on all of the VMs. This configuration will be used by the DeepSpeed
Docker containers in the next step.
## Start the DeepSpeed docker container
We now set up the DeepSpeed Docker containers on the VMs. We provide a script
```bash
./setup_docker.sh
```
to pull the DeepSpeed image onto all VMs and start a container instance in the
background. This will take several minutes since it needs to pull the entire Docker
image.
## Access VMs
The tool [azure_ssh.sh](azure_ssh.sh) will let you SSH into any of the VMs with this
syntax:
```bash
./azure_ssh.sh <node-id> [command]
```
where the `node-id` is a number between `0` and `num_vms-1`. This script will find the
public IP address of your VM and use the SSH key provided in the Azure configuration
JSON.
## Access DeepSpeed container
Everything should be up and running at this point. Let's access the running DeepSpeed
container on the first VM and make sure we can talk to the other containers in our deployment.
* SSH into the first VM via: `./azure_ssh.sh 0`
* Change directories into the azure folder of this repo via: `cd ~/workdir/DeepSpeed/azure`
* Attach to the running Docker container via: `./attach.sh`
* You should now be able to `ssh` into any other docker container; the containers can be
accessed via their SSH alias of `worker-N`, where `N` is the VM number between `0`
and `num_vms-1`. In this example we should be able to successfully run `ssh worker-1
hostname`, which will return the hostname of worker-1.
## Parallel SSH across containers
DeepSpeed comes installed with a helper script `ds_ssh` which is a wrapper around
the [pdsh](https://linux.die.net/man/1/pdsh) command that lets you issue commands
to groups of hosts (via SSH) in parallel. This wrapper simply connects with the
hostfile that defines all the containers in your deployment. For example if you run
`ds_ssh hostname` you should see a list of all the hostnames in your deployment.
## Run CIFAR-10 example model
We will now run the DeepSpeed CIFAR-10 model example to test the VM setup. From inside
the first DeepSpeed container:
1) Install the python dependencies necessary to run the CIFAR-10 example model. You can
do this across your cluster via:
```bash
ds_ssh pip install -r ~/workdir/DeepSpeed/DeepSpeedExamples/cifar/requirements.txt
```
2) Now change directories to the CIFAR example:
```bash
cd ~/workdir/DeepSpeed/DeepSpeedExamples/cifar
```
3) Finally, launch training across all VMs:
```bash
deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json
```
## Megatron-LM GPT2
DeepSpeed includes an example model using Megatron-LM's GPT2. Please refer to the full
[Megatron tutorial](../docs/tutorials/MegatronGPT2Tutorial.md) for more details.
* In order to fully train GPT2 with DeepSpeed and ZeRO we recommend using 8 instances of
Azure's Standard_ND40rs_v2 SKU for a total of 64 NVIDIA V100 GPUs. With this setup and
a batch size of 1536 you should be able to complete 100k training steps (153.6 million
samples) in less than 2 weeks of training.
---
layout: home
---
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
#!/bin/bash
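# Regenerate the Sphinx API stubs for the deepspeed package, then build the HTML docs.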
sphinx-apidoc -f -o source ../../deepspeed
make html
sphinx
recommonmark
sphinx-rtd-theme