Commit c142714b authored by Michael Carilli

Merging in latest master changes

parents b620f96b e6eec3ba
......@@ -37,7 +37,7 @@ optimized for NVIDIA's NCCL communication library.
[Python Source](https://github.com/NVIDIA/apex/tree/master/apex/parallel)
[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/distributed)
[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed)
The [Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
shows use of `apex.parallel.DistributedDataParallel` along with `apex.amp`.
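For orientation, here is a minimal sketch of how the two are combined, patterned on the simple distributed examples further down this page (synthetic data, a single `Linear` layer, launched with `torch.distributed.launch`); the full Imagenet example adds real data loading and command-line options:

```python
import argparse
import torch
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')

N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Initialize Amp first, then wrap the model for distributed training.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
model = DDP(model)

loss_fn = torch.nn.MSELoss()
for t in range(500):
    optimizer.zero_grad()
    loss = loss_fn(model(x), y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
```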
......@@ -69,6 +69,8 @@ It's often convenient to use Apex in Docker containers. Compatible options incl
* [NVIDIA Pytorch containers from NGC](https://ngc.nvidia.com/catalog/containers/nvidia%2Fpytorch), which come with Apex preinstalled. To use the latest Amp API, you may need to `pip uninstall apex` then reinstall Apex using the **Quick Start** commands below.
* [official Pytorch -devel Dockerfiles](https://hub.docker.com/r/pytorch/pytorch/tags), e.g. `docker pull pytorch/pytorch:nightly-devel-cuda10.0-cudnn7`, in which you can install Apex using the **Quick Start** commands.
See the [Docker example folder](https://github.com/NVIDIA/apex/tree/master/examples/docker) for details.
# Quick Start
### Linux
......
......@@ -12,6 +12,7 @@ from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
from ..optimizers import FusedAdam
from ..parallel import DistributedDataParallel as apex_DDP
from ..parallel.LARC import LARC
def to_type(dtype, t):
......@@ -126,7 +127,7 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
from .amp import init as amp_init
optimizers_was_list = False
if isinstance(optimizers, torch.optim.Optimizer):
if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
    optimizers = [optimizers]
elif optimizers is None:
    optimizers = []
......
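The new `LARC` import and the widened `isinstance` check above let `amp.initialize` accept an optimizer wrapped in `apex.parallel.LARC`. A minimal sketch of the usage this enables (model size and learning rate are illustrative only):

```python
import torch
from apex import amp
from apex.parallel.LARC import LARC

model = torch.nn.Linear(1024, 16).cuda()
base_optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

# LARC wraps the base optimizer; the wrapped object is what gets handed to Amp,
# which is why the isinstance check above must recognize it.
optimizer = LARC(base_optimizer)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```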
......@@ -463,4 +463,77 @@ def _process_optimizer(optimizer, properties):
optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)
old_add_param_group = optimizer.add_param_group
def new_add_param_group(self, new_group):
    stash = self._amp_stash

    assert isinstance(new_group, dict), "param group must be a dict"

    new_params = new_group['params']
    if isinstance(new_params, torch.Tensor):
        new_group['params'] = [new_params]
    elif isinstance(new_params, set):
        raise TypeError('optimizer parameters need to be organized in ordered collections, but '
                        'the ordering of tensors in sets will change between runs. Please use a list instead.')
    else:
        new_group['params'] = list(new_params)

    if properties.master_weights:
        # Mutate new_group in-place to use FP32 master params
        fp16_params_this_group = []
        fp32_params_this_group = []
        fp32_from_fp16_params_this_group = []
        for i, param in enumerate(new_group['params']):
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    fp16_params_this_group.append(param)
                    master_param = param.detach().clone().float()
                    master_param.requires_grad = True
                    new_group['params'][i] = master_param
                    fp32_from_fp16_params_this_group.append(master_param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    fp32_params_this_group.append(param)
                    new_group['params'][i] = param
                else:
                    raise TypeError("Optimizer's parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        stash.fp16_groups.append(fp16_params_this_group)
        stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
        stash.fp32_from_fp32_groups.append(fp32_params_this_group)

        stash.all_fp16_params += fp16_params_this_group
        stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
        stash.all_fp32_from_fp32_params += fp32_params_this_group

        # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
        stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]

        # It should be ok to let params be added with existing .grad attributes.
        # for param in fp16_params_this_group:
        #     param.grad = None
        # for param in fp32_from_fp16_params_this_group:
        #     param.grad = None
        # for param in stash.fp32_params_this_group:
        #     param.grad = None
    else:
        for param in new_group['params']:
            if param.type() == 'torch.cuda.HalfTensor':
                stash.all_fp16_params.append(param)
                stash.all_fp16_grad_stash.append(None)
            elif param.type() == 'torch.cuda.FloatTensor':
                stash.all_fp32_params.append(param)
                stash.all_fp32_grad_stash.append(None)
            else:
                raise TypeError("Optimizer's parameters must be either "
                                "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                "Received {}".format(param.type()))

    old_add_param_group(new_group)

optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)

return optimizer
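The patched `add_param_group` above mirrors what Amp does for the original param groups at initialization: with master weights enabled, half-precision parameters in the new group are swapped for FP32 master copies and recorded in `_amp_stash`. From the caller's side nothing changes; a hedged sketch (the extra layer and learning rates are illustrative only):

```python
import torch
from apex import amp

model = torch.nn.Linear(1024, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# O2 maintains FP32 master weights, so the patched add_param_group is exercised.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

# Parameters created after initialization can still be added as a new group;
# Amp builds the corresponding master parameters behind the scenes.
extra_layer = torch.nn.Linear(16, 16).cuda().half()
optimizer.add_param_group({'params': extra_layer.parameters(), 'lr': 1e-4})
```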
......@@ -70,7 +70,7 @@ class Properties(object):
if self.opt_level == "O1" and value is not None:
    warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
                "to run in FP32, so keep_batchnorm_fp32 should be None." +
                "keep_batchnorm_fp32 was {}".format(keep_batchnorm_fp32))
                " keep_batchnorm_fp32 was {}".format(value))
if value == "False":
    self.options[name] = False
elif value == "True":
......@@ -78,7 +78,7 @@ class Properties(object):
    else:
        assert (value is True or value is False or value is None),\
            "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
            "or None, found keep_batchnorm_fp32={}".format(keep_batchnorm_fp32)
            "or None, found keep_batchnorm_fp32={}".format(value)
        self.options[name] = value
elif name == "master_weights":
    if self.opt_level == "O1" and value is not None:
......@@ -303,6 +303,10 @@ def initialize(
if not enabled:
    return models, optimizers

if not torch.backends.cudnn.enabled:
    raise RuntimeError(
        "Amp requires torch.backends.cudnn.enabled = True")

if opt_level not in opt_levels:
    raise RuntimeError(
        "Unexpected optimization level {}. ".format(opt_level) +
......
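The new guard makes the cuDNN requirement explicit, so `amp.initialize` fails fast instead of producing confusing errors later. For reference, a hedged sketch of the call this protects (the opt_level and keep_batchnorm_fp32 values are illustrative; under O1, keep_batchnorm_fp32 must stay None, which is what the corrected warning above reports):

```python
import torch
from apex import amp

# Amp now refuses to initialize if cuDNN has been disabled.
assert torch.backends.cudnn.enabled

model = torch.nn.Linear(1024, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer,
                                  opt_level="O2",
                                  keep_batchnorm_fp32=True)
```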
......@@ -6,6 +6,7 @@ from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params, maybe_print
from ..parallel.LARC import LARC
# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
......@@ -73,11 +74,16 @@ def scale_loss(loss,
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
"""
if not hasattr(_amp_state, "opt_properties"):
    raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized. "
                       "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
                       "before `with amp.scale_loss`.")

if not _amp_state.opt_properties.enabled:
    yield loss
    return

if isinstance(optimizers, torch.optim.Optimizer):
if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
    optimizers = [optimizers]

loss_scaler = _amp_state.loss_scalers[loss_id]
......
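For reference, the pattern these checks guard (a minimal sketch; `amp.initialize` must have been called first, and a `LARC`-wrapped optimizer is now accepted here as well):

```python
import torch
from apex import amp

model = torch.nn.Linear(1024, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

x = torch.randn(64, 1024, device='cuda')
y = torch.randn(64, 16, device='cuda')
loss = torch.nn.functional.mse_loss(model(x), y)

# Calling scale_loss before amp.initialize now raises a clear RuntimeError
# instead of failing on a missing internal attribute.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```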
......@@ -78,6 +78,7 @@ CASTS = [
'addcmul',
'atan2',
'cross',
'bilinear',
# Element-wise _or_ tensor-wise math
'add',
......
......@@ -65,6 +65,8 @@ def convert_network(network, dtype):
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
        continue
    convert_module(module, dtype)
    if isinstance(module, torch.nn.RNNBase) or isinstance(module, torch.nn.modules.rnn.RNNBase):
        module.flatten_parameters()
return network
......
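The added `flatten_parameters()` call re-compacts RNN weights after the dtype conversion, avoiding the familiar cuDNN warning about non-contiguous RNN weights. A hedged sketch of the conversion path (assuming `convert_network` is importable from `apex.fp16_utils`, as the hunk above suggests; the LSTM sizes are illustrative):

```python
import torch
from apex.fp16_utils import convert_network

rnn = torch.nn.LSTM(input_size=32, hidden_size=64, num_layers=2, batch_first=True).cuda()

# convert_network casts parameters and buffers to FP16 (skipping affine batchnorm layers);
# with this change, RNN modules also get flatten_parameters() called after conversion.
rnn = convert_network(rnn, torch.half)

x = torch.randn(8, 16, 32, device='cuda', dtype=torch.half)
out, _ = rnn(x)
```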
......@@ -83,6 +83,7 @@ class FP16_Optimizer(object):
        self.dynamic_loss_scale = False
        self.cur_iter = 0
        self.cur_scale = static_loss_scale
    self.verbose = verbose

def zero_grad(self, set_grads_to_None=True):
    """
......@@ -173,8 +174,9 @@ class FP16_Optimizer(object):
def _update_scale(self, skip):
    if self.dynamic_loss_scale:
        if skip:
            print("\nGrad overflow on iteration", self.cur_iter)
            print("Using dynamic loss scale of", self.cur_scale)
            if self.verbose:
                print("\nGrad overflow on iteration", self.cur_iter)
                print("Using dynamic loss scale of", self.cur_scale)
            self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
......
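With the stored `verbose` flag, the per-iteration overflow messages printed during dynamic loss scaling can now be silenced. A hedged sketch, assuming this is the `FP16_Optimizer` wrapper from `apex.optimizers` that pairs with `FusedAdam` (as the imports earlier in this commit suggest):

```python
import torch
from apex.optimizers import FusedAdam, FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FusedAdam(model.parameters(), lr=1e-3)

# verbose=False suppresses the "Grad overflow on iteration ..." prints
# that would otherwise appear every time a step is skipped.
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True, verbose=False)
```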
......@@ -104,8 +104,6 @@ class Reducer(object):
When used with this launcher, :class:`Reducer` assumes 1:1 mapping of processes to GPUs.
It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
main_reducer.py in https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows example usage.
Args:
module_or_grads_list: Either a network definition (module) being run in multi-gpu/distributed mode, or an iterable of gradients to be reduced. If a module is passed in, the Reducer constructor will sync the parameters across processes (broadcasting from rank 0) to make sure they're all initialized with the same values. If a list of gradients (that came from some module) is passed in, the user is responsible for manually syncing that module's parameters at the beginning of training.
"""
......@@ -143,7 +141,7 @@ class DistributedDataParallel(Module):
When used with this launcher, :class:`DistributedDataParallel` assumes 1:1 mapping of processes to GPUs.
It also assumes that your script calls ``torch.cuda.set_device(args.rank)`` before creating the model.
https://github.com/NVIDIA/apex/tree/master/examples/distributed shows detailed usage.
https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed shows detailed usage.
https://github.com/NVIDIA/apex/tree/master/examples/imagenet shows another example
that combines :class:`DistributedDataParallel` with mixed precision training.
......
......@@ -26,9 +26,6 @@ class SyncBatchnormFunction(Function):
count = int(input.numel()/input.size(1))
mean, var_biased = syncbn.welford_mean_var(input)

if count == 1:
    raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))

if torch.distributed.is_initialized():
    if not process_group:
        process_group = torch.distributed.group.WORLD
......@@ -45,6 +42,9 @@ class SyncBatchnormFunction(Function):
inv_std = 1.0 / torch.sqrt(var_biased + eps)
var = var_biased * (count) / (count-1)

if count == 1 and world_size < 2:
    raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))

r_m_inc = mean if running_mean.dtype != torch.float16 else mean.half()
r_v_inc = var if running_variance.dtype != torch.float16 else var.half()
running_mean.data = running_mean.data * (1-momentum) + momentum*r_m_inc
......
......@@ -66,10 +66,23 @@ class SyncBatchNorm(_BatchNorm):
torch.cuda.nvtx.range_push("sync_bn_fw_with_mean_var")
mean = None
var = None
cast = None
out = None

# casting to handle mismatch input type to layer type
if self.running_mean is not None:
    if self.running_mean.dtype != input.dtype:
        input = input.to(self.running_mean.dtype)
        cast = input.dtype
elif self.weight is not None:
    if self.weight.dtype != input.dtype:
        input = input.to(self.weight.dtype)
        cast = input.dtype

if not self.training and self.track_running_stats:
    # fall back to pytorch implementation for inference
    torch.cuda.nvtx.range_pop()
    return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
    out = F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
else:
    process_group = self.process_group
    world_size = 1
......@@ -114,4 +127,5 @@ class SyncBatchNorm(_BatchNorm):
                (m-1) * self.momentum * var + \
                (1 - self.momentum) * self.running_var
    torch.cuda.nvtx.range_pop()
    return SyncBatchnormFunction.apply(input, self.weight, self.bias, mean, var, self.eps, process_group, world_size)
    out = SyncBatchnormFunction.apply(input, self.weight, self.bias, mean, var, self.eps, process_group, world_size)
out = out.to(cast)
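Taken together, these changes let `apex.parallel.SyncBatchNorm` accept inputs whose dtype differs from the layer's parameters and running statistics (the input is cast to the layer's dtype), and the one-value-per-channel error now only fires when the batch is too small across the whole process group. A hedged usage sketch with the usual conversion helper (the toy model is a placeholder):

```python
import torch
from apex.parallel import convert_syncbn_model

model = torch.nn.Sequential(
    torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
    torch.nn.BatchNorm2d(8),
    torch.nn.ReLU(),
).cuda()

# Replace every torch.nn.BatchNorm*d with apex.parallel.SyncBatchNorm.
# In a multi-process run the layers reduce statistics across the process group;
# in a single process they behave like ordinary batchnorm.
model = convert_syncbn_model(model)
```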
# Simple examples of FP16_Optimizer functionality
To use `FP16_Optimizer` on a half-precision model, or a model with a mixture of
half and float parameters, only two lines of your training script need to change:
1. Construct an `FP16_Optimizer` instance from an existing optimizer.
2. Replace `loss.backward()` with `optimizer.backward(loss)`.
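As a before/after sketch of those two changes (the model and data mirror the minimal example reproduced further down this page):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Change 1: wrap the existing optimizer.
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)

x = torch.randn(64, 1024, device='cuda', dtype=torch.half)
y = torch.randn(64, 16, device='cuda', dtype=torch.half)

optimizer.zero_grad()
loss = torch.nn.MSELoss()(model(x).float(), y.float())
# Change 2: loss.backward() becomes optimizer.backward(loss).
optimizer.backward(loss)
optimizer.step()
```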
#### [Full API Documentation](https://nvidia.github.io/apex/fp16_utils.html#automatic-management-of-master-params-loss-scaling)
See "Other Options" at the bottom of this page for some cases that require special treatment.
#### Minimal Working Sample
`minimal.py` shows the basic usage of `FP16_Optimizer` with either static or dynamic loss scaling. Test via `python minimal.py`.
#### Closures
`FP16_Optimizer` supports closures with the same control flow as ordinary Pytorch optimizers.
`closure.py` shows an example. Test via `python closure.py`.
See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.step) for more details.
#### Serialization/Deserialization
`FP16_Optimizer` supports saving and loading with the same control flow as ordinary Pytorch optimizers.
`save_load.py` shows an example. Test via `python save_load.py`.
See [the API documentation](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.load_state_dict) for more details.
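`save_load.py` itself is not reproduced on this page; the control flow it demonstrates is the usual PyTorch checkpoint pattern (a hedged sketch with a hypothetical file name):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                           dynamic_loss_scale=True)

# Saving: FP16_Optimizer.state_dict() captures the master params and loss scaler state.
torch.save({'model': model.state_dict(),
            'optimizer': optimizer.state_dict()}, 'checkpoint.pt')  # hypothetical path

# Loading: restore the model first, then the FP16_Optimizer state.
checkpoint = torch.load('checkpoint.pt')
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
```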
#### Distributed
**distributed_apex** shows an example using `FP16_Optimizer` with Apex DistributedDataParallel.
The usage of `FP16_Optimizer` with distributed does not need to change from ordinary single-process
usage. Test via
```bash
cd distributed_apex
bash run.sh
```
**distributed_pytorch** shows an example using `FP16_Optimizer` with Pytorch DistributedDataParallel.
Again, the usage of `FP16_Optimizer` with distributed does not need to change from ordinary
single-process usage. Test via
```bash
cd distributed_pytorch
bash run.sh
```
#### Other Options
Gradient clipping requires that calls to `torch.nn.utils.clip_grad_norm`
be replaced with [fp16_optimizer_instance.clip_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.clip_master_grads). The [word_language_model example](https://github.com/NVIDIA/apex/blob/master/examples/word_language_model/main_fp16_optimizer.py) uses this feature.
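A hedged sketch of the replacement (the max_norm value is illustrative):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                           static_loss_scale=128.0)

x = torch.randn(64, 1024, device='cuda', dtype=torch.half)
y = torch.randn(64, 16, device='cuda', dtype=torch.half)

optimizer.zero_grad()
loss = torch.nn.MSELoss()(model(x).float(), y.float())
optimizer.backward(loss)

# Instead of torch.nn.utils.clip_grad_norm(model.parameters(), 5.0),
# clip the FP32 master gradients that FP16_Optimizer maintains.
optimizer.clip_master_grads(5.0)

optimizer.step()
```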
Multiple losses will work if you simply replace
```python
loss1.backward()
loss2.backward()
```
with
```python
optimizer.backward(loss1)
optimizer.backward(loss2)
```
but `FP16_Optimizer` can be told to handle this more efficiently using the
[update_master_grads()](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.FP16_Optimizer.update_master_grads) option.
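A hedged sketch of the more efficient variant (the two losses and the separate forward passes are artificial, chosen to keep the example self-contained):

```python
import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=1e-3),
                           static_loss_scale=128.0)

x = torch.randn(64, 1024, device='cuda', dtype=torch.half)
y = torch.randn(64, 16, device='cuda', dtype=torch.half)

optimizer.zero_grad()

loss1 = torch.nn.functional.mse_loss(model(x).float(), y.float())
optimizer.backward(loss1, update_master_grads=False)

loss2 = torch.nn.functional.l1_loss(model(x).float(), y.float())
optimizer.backward(loss2, update_master_grads=False)

# Copy fp16 model grads to the fp32 master grads once, after both backward passes.
optimizer.update_master_grads()
optimizer.step()
```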
import torch
from apex.fp16_utils import FP16_Optimizer
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.LBFGS(model.parameters())
### Construct FP16_Optimizer
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
###
loss_fn = torch.nn.MSELoss()
for t in range(5):
    def closure():
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_fn(y_pred.float(), y.float())
        ### Change loss.backward() within the closure to: ###
        optimizer.backward(loss)
        ###
        return loss
    loss = optimizer.step(closure)
print("final loss = ", loss)
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`apex.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
single-process usage. Test via
```bash
bash run.sh
```
import torch
import argparse
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import FP16_Optimizer
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl',
                                     init_method='env://')
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
model = DDP(model)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
### Construct FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###
loss_fn = torch.nn.MSELoss()
for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())
    ### Change loss.backward() to: ###
    optimizer.backward(loss)
    ###
    optimizer.step()
print("final loss = ", loss)
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
**distributed_data_parallel.py** and **run.sh** show an example using `FP16_Optimizer` with
`torch.nn.parallel.DistributedDataParallel` and the Pytorch multiprocess launcher script,
[torch.distributed.launch](https://pytorch.org/docs/master/distributed.html#launch-utility).
The usage of `FP16_Optimizer` with distributed does not need to change from ordinary
single-process usage. Test via
```bash
bash run.sh
```
import torch
import argparse
from apex.fp16_utils import FP16_Optimizer
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl',
                                     init_method='env://')
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
model = torch.nn.parallel.DistributedDataParallel(model,
                                                   device_ids=[args.local_rank],
                                                   output_device=args.local_rank)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
### Construct FP16_Optimizer ###
optimizer = FP16_Optimizer(optimizer)
###
loss_fn = torch.nn.MSELoss()
for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())
    ### Change loss.backward() to: ###
    optimizer.backward(loss)
    ###
    optimizer.step()
print("final loss = ", loss)
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 distributed_data_parallel.py
import torch
from apex.fp16_utils import FP16_Optimizer
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device='cuda', dtype=torch.half)
y = torch.randn(N, D_out, device='cuda', dtype=torch.half)
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
### Construct FP16_Optimizer
### FP16_Optimizer will ingest and remember the original optimizer's param_groups.
###
### Construct with static loss scaling...
optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.0)
### ...or dynamic loss scaling
# optimizer = FP16_Optimizer(optimizer,
# dynamic_loss_scale=True,
# dynamic_loss_args={'scale_factor' : 2})
### dynamic_loss_args is optional, for "power users," and unnecessary in most cases.
loss_fn = torch.nn.MSELoss()
for t in range(200):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred.float(), y.float())
    ### Change loss.backward() to:
    optimizer.backward(loss)
    ###
    optimizer.step()
print("final loss = ", loss)