To use `FP16_Optimizer` on a half-precision model, or a model with a mixture of
half and float parameters, only two lines of your training script need to change:
1. Construct an `FP16_Optimizer` instance from an existing optimizer.
2. Replace `loss.backward()` with `optimizer.backward(loss)`.
[Full API Documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
See "Other Options" at the bottom of this page for some cases that require special treatment.
#### Minimal Working Sample
`minimal.py` shows the basic usage of `FP16_Optimizer` with either static or dynamic loss scaling. Test via `python minimal.py`.
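Roughly, the two scaling modes differ only in how the wrapper is constructed. A sketch (the placeholder model and optimizer below are invented for illustration; `minimal.py` is the authoritative example):
```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(4, 4).cuda().half()              # placeholder half-precision model
inner_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Static loss scaling: gradients are scaled by a fixed constant chosen up front.
optimizer = FP16_Optimizer(inner_optimizer, static_loss_scale=128.0)

# Dynamic loss scaling: the wrapper adjusts the scale automatically,
# skipping steps whose gradients overflow.
optimizer = FP16_Optimizer(inner_optimizer, dynamic_loss_scale=True)
```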
See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.step) for more details.
<!---
#### Serialization/Deserialization

TODO: add checkpointing example showing deserialization on the correct device
`FP16_Optimizer` supports saving and loading with the same control flow as ordinary Pytorch optimizers.
-->

#### Checkpointing

`FP16_Optimizer` also supports checkpointing with the same control flow as ordinary Pytorch optimizers.
`save_load.py` shows an example. Test via `python save_load.py`.
See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.load_state_dict) for more details.
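A rough sketch of what that control flow looks like (the filename and the surrounding `model`/`optimizer` objects are placeholders; `save_load.py` is the authoritative example):
```python
import torch

# Saving: FP16_Optimizer exposes state_dict() like an ordinary Pytorch optimizer.
torch.save({"model": model.state_dict(),
            "optimizer": optimizer.state_dict()},
           "checkpoint.pt")

# Loading: restore the model, then the already-constructed FP16_Optimizer wrapper.
checkpoint = torch.load("checkpoint.pt")
model.load_state_dict(checkpoint["model"])
optimizer.load_state_dict(checkpoint["optimizer"])
```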
#### Distributed

**distributed_apex** shows an example using `FP16_Optimizer` with Apex DistributedDataParallel.
**distributed_pytorch** shows an analogous example using Pytorch's native DistributedDataParallel.
In both cases, the usage of `FP16_Optimizer` does not need to change from ordinary
single-process usage. Test via
```bash
cd distributed_pytorch
bash run.sh
```
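As a rough, hedged sketch of the pattern the distributed scripts follow (process-group initialization, the data pipeline, and the hyperparameters are omitted or invented here; see the example directories and their `run.sh` for the real setup):
```python
import torch
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import FP16_Optimizer

# Assumes torch.distributed has already been initialized by the launcher.
model = torch.nn.Linear(512, 512).cuda().half()
model = DDP(model)  # averages gradients across processes during backward

# The FP16_Optimizer construction itself is identical to the single-process case.
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                           dynamic_loss_scale=True)
```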
#### Other Options
Gradient clipping requires that calls to `torch.nn.utils.clip_grad_norm`
be replaced with [fp16_optimizer_instance.clip_master_grads](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.clip_master_grads).
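For instance, a clipping call inside the training loop might change roughly as follows (snippet only; `loss`, `model`, and `optimizer` are as in the earlier sketch, and the max-norm value of 1.0 is an arbitrary placeholder):
```python
# Before, with an ordinary fp32 optimizer:
#     loss.backward()
#     torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
#     optimizer.step()

# After wrapping with FP16_Optimizer: clip the fp32 master gradients instead,
# between the backward pass and the step.
optimizer.backward(loss)
optimizer.clip_master_grads(1.0)
optimizer.step()
```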
Multiple losses will work if you simply replace
```python
loss1.backward()
loss2.backward()
```
with
```python
optimizer.backward(loss1)
optimizer.backward(loss2)
```
but `FP16_Optimizer` can be told to handle this more efficiently using the