ZeRO-Offload release (#391)

* ZeRO-Offload (squash) (#381)

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Co-authored-by: Jie <37380896+jren73@users.noreply.github.com>
Co-authored-by: Arash Ashari <arashari@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com>
Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
Co-authored-by: arashashari <arashashari@ArashMSLaptop.redmond.corp.microsoft.com>
Co-authored-by: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com>
Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>

ZeRO-Offload release (#391)
* ZeRO-Offload (squash) (#381)

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Co-authored-by: Jie <37380896+jren73@users.noreply.github.com>
Co-authored-by: Arash Ashari <arashari@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com>
Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
Co-authored-by: arashashari <arashashari@ArashMSLaptop.redmond.corp.microsoft.com>
Co-authored-by: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com>
Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
41db1c2f · Jeff Rasley · GitHub · 79093d74 · 41db1c2f · 41db1c2f
Unverified Commit 41db1c2f authored Sep 09, 2020 by Jeff Rasley Committed by GitHub Sep 09, 2020
20 changed files
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -2,4 +2,5 @@ torch>=1.2
 torchvision>=0.4.0
 tqdm
 psutil
+cpufeature
 tensorboardX==1.8
--- a/setup.py
+++ b/setup.py
--- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero1.json
@@ -3,13 +3,7 @@
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "zero_optimization": {
-    "stage":1
+    "stage": 1
-  },
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": 0.00015
-    }
  },
  "gradient_clipping": 1.0,
  "fp16": {

--- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2.json
@@ -3,17 +3,11 @@
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "zero_optimization": {
-    "stage":2,
+    "stage": 2,
    "reduce_bucket_size": 7000000,
    "allgather_bucket_size": 7000000,
    "reduce_scatter": true
  },
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": 0.00015
-    }
-  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true,

--- a/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs4_zero2_offload.json
+{
+  "train_batch_size": 4,
+  "gradient_accumulation_steps": 1,
+  "steps_per_print": 1,
+  "zero_optimization": {
+    "stage": 2,
+    "reduce_bucket_size": 7000000,
+    "allgather_bucket_size": 7000000,
+    "reduce_scatter": true,
+    "cpu_offload": true
+  },
+  "gradient_clipping": 1.0,
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  }
+}
--- a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
@@ -3,13 +3,7 @@
  "gradient_accumulation_steps": 1,
  "steps_per_print": 1,
  "zero_optimization": {
-    "stage":0
+    "stage": 0
-  },
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": 0.00015
-    }
  },
  "gradient_clipping": 1.0,
  "fp16": {

--- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas3.json
 {
-  "train_micro_batch_size_per_gpu":8,
+  "train_micro_batch_size_per_gpu": 8,
  "gradient_accumulation_steps": 3,
  "steps_per_print": 1,
  "zero_optimization": {
-    "stage":0,
+    "stage": 0,
    "reduce_bucket_size": 7000000,
    "allgather_bucket_size": 7000000,
    "reduce_scatter": true
  },
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": 0.00015
-    }
-  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true,
@@ -26,5 +20,4 @@
    "partition_activations": true,
    "contiguous_memory_optimization": true
  }
 }
--- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json
--- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json
--- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas3.json
--- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_offload.json
--- a/tests/model/Megatron_GPT2/ds_config_func_scheduler.json
+++ b/tests/model/Megatron_GPT2/ds_config_func_scheduler.json
--- a/tests/model/Megatron_GPT2/ds_config_perf_bs16.json
+++ b/tests/model/Megatron_GPT2/ds_config_perf_bs16.json
--- a/tests/model/Megatron_GPT2/ds_config_perf_bs32.json
+++ b/tests/model/Megatron_GPT2/ds_config_perf_bs32.json
--- a/tests/model/Megatron_GPT2/ds_config_perf_bs8.json
+++ b/tests/model/Megatron_GPT2/ds_config_perf_bs8.json
--- a/tests/model/Megatron_GPT2/ds_gpt2_test.sh
+++ b/tests/model/Megatron_GPT2/ds_gpt2_test.sh
--- a/tests/model/Megatron_GPT2/run_checkpoint_test.py
+++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py
--- a/tests/model/Megatron_GPT2/run_func_test.py
+++ b/tests/model/Megatron_GPT2/run_func_test.py
--- a/tests/perf/adam_test.py
+++ b/tests/perf/adam_test.py
--- a/tests/perf/adam_test1.py
+++ b/tests/perf/adam_test1.py