Unverified commit 2710fad5 authored by TobeyQin, committed by GitHub

Docs - Release Note and Introduction (#107)

* Add introduction and release documents.
* Fix some typos in documents.
parent 1e96c27e
......@@ -19,25 +19,13 @@ id: micro-benchmarks
<table>
<tbody>
<tr align="center" valign="bottom">
<td>
</td>
<td>
<b>Micro Benchmark</b>
<img src={require('../assets/bar.png').default}/>
</td>
<td>
<b>Model Benchmark</b>
<img src={require('../assets/bar.png').default}/>
</td>
</tr>
<tr valign="top">
<td align="center" valign="middle">
<b>Metrics</b>
</td>
<td>
<ul><li><b>Computation Benchmark</b></li>
<ul><li><b>Kernel Performance</b></li>
<ul><li><b>GEMM FLOPS</b></li>
<ul>
<li>GFLOPS</li>
<li>TensorCore</li>
......@@ -54,12 +42,12 @@ id: micro-benchmarks
<ul><li><b>Operator Performance</b></li>
<ul><li>MatMul</li><li>Sharding_MatMul</li></ul>
</ul>
</ul>
<ul><li><b>Communication Benchmark</b></li>
<ul><li><b>Memory</b></li>
<ul><li>H2D_Mem_BW_&lt;GPU ID&gt;</li>
<li>H2D_Mem_BW_&lt;GPU ID&gt;</li></ul>
<li>D2H_Mem_BW_&lt;GPU ID&gt;</li></ul>
</ul>
</ul>
<ul><li><b>Communication Benchmark</b></li>
<ul><li><b>Device P2P Bandwidth</b></li>
<ul><li>P2P_BW_Max</li><li>P2P_BW_Min</li><li>P2P_BW_Avg</li></ul>
</ul>
......@@ -80,34 +68,12 @@ id: micro-benchmarks
<ul><li><b>Storage Benchmark</b></li>
<ul><li><b>Disk</b></li>
<ul>
<li>Read/Write</li><li>Rand_Read/Rand_Write</li>
<li>R/W_Read</li><li>R/W_Write</li><li>Rand_R/W_Read</li><li>Rand_R/W_Write</li>
<li>Seq_Read/Seq_Write</li><li>Rand_Read/Rand_Write</li>
<li>Seq_R/W_Read</li><li>Seq_R/W_Write</li><li>Rand_R/W_Read</li><li>Rand_R/W_Write</li>
</ul>
</ul>
</ul>
</td>
<td>
<ul><li><b>CNN models</b></li>
<ul>
<li><b>ResNet</b></li>
<ul><li>ResNet-50</li><li>ResNet-101</li><li>ResNet-152</li></ul>
</ul>
<ul>
<li><b>DenseNet</b></li>
<ul><li>DenseNet-169</li><li>DenseNet-201</li></ul>
</ul>
<ul>
<li><b>VGG</b></li>
<ul><li>VGG-11</li><li>VGG-13</li><li>VGG-16</li><li>VGG-19</li></ul>
</ul>
<ul><li><b>Other CNN models</b></li><ul><li>...</li></ul></ul>
</ul>
<ul><li><b>BERT models</b></li>
<ul><li><b>BERT</b></li><li><b>BERT_LARGE</b></li></ul>
</ul>
<ul><li><b>LSTM</b></li></ul>
<ul><li><b>GPT-2</b></li></ul>
</td>
</tr>
</tbody>
</table>
......@@ -19,73 +19,10 @@ id: model-benchmarks
<table>
<tbody>
<tr align="center" valign="bottom">
<td>
</td>
<td>
<b>Micro Benchmark</b>
<img src={require('../assets/bar.png').default}/>
</td>
<td>
<b>Model Benchmark</b>
<img src={require('../assets/bar.png').default}/>
</td>
</tr>
<tr valign="top">
<td align="center" valign="middle">
<b>Metrics</b>
</td>
<td>
<ul><li><b>Computation Benchmark</b></li>
<ul><li><b>Kernel Performance</b></li>
<ul>
<li>GFLOPS</li>
<li>TensorCore</li>
<li>cuBLAS</li>
<li>cuDNN</li>
</ul>
</ul>
<ul><li><b>Kernel Launch Time</b></li>
<ul>
<li>Kernel_Launch_Event_Time</li>
<li>Kernel_Launch_Wall_Time</li>
</ul>
</ul>
<ul><li><b>Operator Performance</b></li>
<ul><li>MatMul</li><li>Sharding_MatMul</li></ul>
</ul>
<ul><li><b>Memory</b></li>
<ul><li>H2D_Mem_BW_&lt;GPU ID&gt;</li>
<li>H2D_Mem_BW_&lt;GPU ID&gt;</li></ul>
</ul>
</ul>
<ul><li><b>Communication Benchmark</b></li>
<ul><li><b>Device P2P Bandwidth</b></li>
<ul><li>P2P_BW_Max</li><li>P2P_BW_Min</li><li>P2P_BW_Avg</li></ul>
</ul>
<ul><li><b>RDMA</b></li>
<ul><li>RDMA_Peak</li><li>RDMA_Avg</li></ul>
</ul>
<ul><li><b>NCCL</b></li>
<ul><li>NCCL_AllReduce</li></ul>
<ul><li>NCCL_AllGather</li></ul>
<ul><li>NCCL_broadcast</li></ul>
<ul><li>NCCL_reduce</li></ul>
<ul><li>NCCL_reduce_scatter</li></ul>
</ul>
</ul>
<ul><li><b>Computation-Communication Benchmark</b></li>
<ul><li><b>Mul_During_NCCL</b></li><li><b>MatMul_During_NCCL</b></li></ul>
</ul>
<ul><li><b>Storage Benchmark</b></li>
<ul><li><b>Disk</b></li>
<ul>
<li>Read/Write</li><li>Rand_Read/Rand_Write</li>
<li>R/W_Read</li><li>R/W_Write</li><li>Rand_R/W_Read</li><li>Rand_R/W_Write</li>
</ul>
</ul>
</ul>
</td>
<td>
<ul><li><b>CNN models</b></li>
<ul>
......@@ -103,7 +40,7 @@ id: model-benchmarks
<ul><li><b>Other CNN models</b></li><ul><li>...</li></ul></ul>
</ul>
<ul><li><b>BERT models</b></li>
<ul><li><b>BERT</b></li><li><b>BERT_LARGE</b></li></ul>
<ul><li><b>BERT-Base</b></li><li><b>BERT-Large</b></li></ul>
</ul>
<ul><li><b>LSTM</b></li></ul>
<ul><li><b>GPT-2</b></li></ul>
......
......@@ -18,16 +18,12 @@ __SuperBench__ is a validation and profiling tool for AI infrastructure, which s
* Provide comprehensive performance comparison between different existing hardware
* Provide insights for hardware and software co-design
It includes micro-benchmark for primitive computation and communication benchmarking,
and model-benchmark to measure domain-aware end-to-end deep learning workloads.
It provides micro-benchmark for primitive computation and communication benchmarking,
as well as model-benchmark to measure domain-aware end-to-end deep learning workloads.
:::note
SuperBench is in the early pre-alpha stage of open source and is not ready for the general public yet.
If you want to jump in early, you can try building the latest code yourself.
:::
## Overview
The following figure shows the capabilities provide by SuperBench core framework and its extension.
The following figure shows the capabilities provided by SuperBench core framework and its extension.
![SuperBench Structure](./assets/superbench_structure.png)
![SuperBench Structure](./assets/architecture.png)
......@@ -8,4 +8,107 @@ author_url: https://github.com/TobeyQin
tags: [superbench, announcement]
---
We are very happy to introduce [SuperBench](https://github.com/microsoft/superbenchmark) to help you validate AI infrastructure.
This blog introduces [SuperBench](https://github.com/microsoft/superbenchmark) to help you validate AI infrastructure.
## The Advantages of SuperBench
### Easy-to-use CLI
To provide a good user experience, SuperBench offers a command line interface to help users deploy and run benchmarks.
With the SuperBench CLI, users can deploy and run their benchmarks with a single command, which greatly shortens the learning curve
and helps users easily evaluate the performance of their AI workloads.
Below is a simple example showing how to deploy and run benchmarks locally. For more information,
please see the [CLI document](https://microsoft.github.io/superbenchmark/docs/cli).
1. Deploy
```bash
sb deploy -f local.ini
```
2. Run Benchmark
```bash
sb run -f local.ini -c config.yaml
```
Here, `local.ini` is the configuration file that manages the worker nodes that will actually run the benchmarks.
In the case below, the worker node is `localhost`, the same as the control node.
```ini title="local.ini"
[all]
localhost ansible_connection=local
```
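To benchmark a real cluster rather than a single machine, the same Ansible-style inventory can list multiple worker nodes. Below is a minimal sketch with hypothetical host names, addresses, and SSH user; the file name `remote.ini` is only an example.

```ini title="remote.ini"
# Hypothetical multi-node inventory -- host names, IPs, and the SSH user are placeholders.
[all]
node1 ansible_host=10.0.0.1 ansible_user=username
node2 ansible_host=10.0.0.2 ansible_user=username
```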
`config.yaml` is a config file that specifies the details of the benchmarks. You can customize your benchmarks by modifying this file.
For more information, please see the [configuration](https://microsoft.github.io/superbenchmark/docs/getting-started/configuration) document.
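As a rough illustration only, a trimmed-down `config.yaml` could enable a single benchmark and override a few of its parameters. The keys below are assumptions made for this sketch, not the authoritative schema; the configuration document linked above describes the real format.

```yaml title="config.yaml"
# Illustrative sketch only -- keys and values are assumptions, check the configuration document.
superbench:
  enable:
    - kernel-launch          # run only this micro-benchmark
  benchmarks:
    kernel-launch:
      modes:
        - name: local        # execute locally on each worker node
      parameters:
        num_steps: 1000000   # example parameter override
```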
### Modular and Extensible Framework
1. Executor Framework
To facilitate benchmarking and validation on large-scale clusters, we designed and implemented a modular and extensible framework.
The SuperBench framework includes a runner as the control node, as well as multiple executors as worker nodes.
The runner receives commands from the CLI, distributes tasks to all worker nodes in the cluster, collects data, and summarizes the results.
Each worker runs an executor to execute the specified benchmark tasks.
![SuperBench Executor Workflow](../../docs/assets/executor_workflow.png)
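For example, assuming a multi-node inventory like the hypothetical `remote.ini` sketched earlier, the same two CLI commands drive the whole cluster: the runner on the control node deploys the environment and launches executors on every listed worker node.

```bash
# Hypothetical multi-node run; remote.ini is the example inventory sketched above.
sb deploy -f remote.ini
sb run -f remote.ini -c config.yaml
```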
2. Benchmark Framework
SuperBench supports micro-benchmarks for primitive computation and communication benchmarking,
as well as model-benchmarks to measure domain-aware end-to-end deep learning workloads.
SuperBench implements an abstract `BenchmarkBase` class to provide common functionality, and all benchmarks are built on top of this abstract class.
It also provides a unified interface and result format for all benchmarks,
so developers can easily add new benchmarks.
![SuperBench Benchmark Package](../../docs/assets/benchmark_package.png)
### Comprehensive and Standardized Benchmarks
SuperBench supports the set of benchmarks listed below.
* Micro-Benchmarks
* Computation benchmarks
* GEMM Flops
* Kernel Launch Time
* Operator Performance
* Communication benchmarks
* Memory
* Device P2P
* RDMA
* NCCL
* Computation-Communication Benchmarks
* Storage Benchmarks
* Model-Benchmarks
* CNN models
* LSTM models
* BERT models
* GPT-2 models
For the details of each benchmark, please see [micro-benchmarks](https://microsoft.github.io/superbenchmark/docs/benchmarks/micro-benchmarks.md)
and [model-benchmarks](https://microsoft.github.io/superbenchmark/docs/benchmarks/model-benchmarks.md).
## What's next?
We want to extend SuperBench's capabilities to distributed validation and auto-diagnosis, building a benchmarking ecosystem.
The following figure shows the whole picture.
![SuperBench Capabilities and Extension](../../docs/assets/architecture.png)
With SuperBench and its extensions, we can support:
* Quick and trustworthy distributed validation
  * Distributed validation tools to validate hundreds or thousands of servers automatically
  * Provide minute-level fast validation and guarantee high repeatability for each benchmark
  * Provide baselines for different systems as performance/quality gates for hardware and system release
* Detailed auto-diagnosis
  * Provide comprehensive diagnosis benchmarks to analyze detailed issues on defective nodes
  * Provide detailed performance reports and advanced analysis tools
## Call for Contributors
This project welcomes contributions and suggestions.
---
slug: release-sb-v0.2
title: Releasing SuperBench v0.2
author: Tingting Qin
author_title: SuperBench Team
author_url: https://github.com/TobeyQin
tags: [superbench, announcement, release]
---
We are very happy to announce that **SuperBench 0.2.0** is officially released today!
You can install and try SuperBench by following the [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
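If you want to try it right away, installing from source looks roughly like the sketch below; treat it as an outline and follow the Getting Started tutorial for the exact, supported steps (Python version, dependencies, and deployment).

```bash
# Rough sketch of installing from source; see the Getting Started tutorial for the authoritative steps.
git clone https://github.com/microsoft/superbenchmark
cd superbenchmark
python3 -m pip install .
sb --help    # confirm the CLI entry point is available
```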
## SuperBench 0.2.0 Release Notes
### SuperBench Framework
* Implemented the SuperBench command line interface (CLI).
* Implemented the Runner for node control and management.
* Implemented the Executor.
* Implemented the Benchmark framework.
### Supported Benchmarks
* Supported Micro-benchmarks
* GEMM FLOPS (GFLOPS, TensorCore, cuBLAS, cuDNN)
* Kernel Launch Time (Kernel_Launch_Event_Time, Kernel_Launch_Wall_Time)
* Operator Performance (MatMul, Sharding_MatMul)
* Supported Model-benchmarks
* CNN models
(Reference: [torchvision models](https://github.com/pytorch/vision/tree/v0.8.0/torchvision/models))
* ResNet (ResNet-18, ResNet-34, ResNet-50, ResNet-101, ResNet-152)
* DenseNet (DenseNet-161, DenseNet-169, DenseNet-201)
* VGG (VGG-11, VGG-13, VGG-16, VGG-19, VGG11_bn, VGG13_bn, VGG16_bn, VGG19_bn)
* MNASNet (mnasnet0_5, mnasnet0_75, mnasnet1_0, mnasnet1_3)
* AlexNet
* GoogLeNet
* Inception_v3
* mobilenet_v2
* ResNeXt (resnext50_32x4d, resnext101_32x8d)
* Wide ResNet (wide_resnet50_2, wide_resnet101_2)
* ShuffleNet (shufflenet_v2_x0_5, shufflenet_v2_x1_0, shufflenet_v2_x1_5, shufflenet_v2_x2_0)
* SqueezeNet (squeezenet1_0, squeezenet1_1)
* LSTM model
* BERT models (BERT-Base, BERT-Large)
* GPT-2 model (specify which config)
### Examples and Documents
* Added examples showing how to run each benchmark individually.
* Added tutorial documents (introduction, getting-started, developer-guides, APIs, benchmarks).
* Built the SuperBench [website](https://aka.ms/superbench/).
\ No newline at end of file
......@@ -15,14 +15,14 @@ module.exports = {
'getting-started/run-superbench',
],
},
// {
// type: 'category',
// label: 'Benchmarks',
// items: [
// 'benchmarks/micro-benchmarks',
// 'benchmarks/model-benchmarks',
// ],
// },
{
type: 'category',
label: 'Benchmarks',
items: [
'benchmarks/micro-benchmarks',
'benchmarks/model-benchmarks',
],
},
{
type: 'category',
label: 'Developer Guides',
......