Unverified commit fb7b6a93 authored by Rohan Varma, committed by GitHub

[FairScale] Remove refs to "cpu_offload" in code comments (#814)

* fix

* remove dup file
parent 8acbec71
@@ -191,7 +191,7 @@ class FullyShardedDataParallel(nn.Module):
move_grads_to_cpu (bool, Optional):
move gradient shard to CPU after reduction. This is useful when
combined with CPU-based optimizers. It defaults to the value of
*``cpu_offload``*.
*``move_params_to_cpu``*.
bucket_cap_mb (int, Optional):
FSDP will bucket parameters so that gradient reduction can
be more efficient for small parameters.
@@ -251,7 +251,8 @@ class FullyShardedDataParallel(nn.Module):
cpu_offload (bool, Optional):
if ``True``, offload FP32 params to CPU. This is only relevant when
*``mixed_precision``* is ``True``. Note: This arg will be deprecated in favor of
*``move_params_to_cpu``* in an upcoming release.
*``move_params_to_cpu``* in an upcoming release. Please prefer
specifying ``move_params_to_cpu`` instead.
"""
def __init__(
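For context, the arguments touched by this rename fit together as sketched below. This is a minimal, hedged example and not part of the commit: the model and sizes are placeholders, and it assumes ``torch.distributed`` has already been initialized so FSDP can use the default process group. ``move_grads_to_cpu`` is left unset so it falls back to the value of ``move_params_to_cpu``, and the deprecated ``cpu_offload`` alias is avoided.

```python
import torch.nn as nn
from fairscale.nn import FullyShardedDataParallel as FSDP

# Placeholder model; in practice this is the module you want to shard.
model = nn.Linear(1024, 1024).cuda()

# move_params_to_cpu replaces the deprecated cpu_offload alias and requires
# mixed_precision=True. move_grads_to_cpu is omitted, so it defaults to the
# value of move_params_to_cpu, as the updated docstring states.
fsdp_model = FSDP(
    model,
    mixed_precision=True,
    move_params_to_cpu=True,
)
```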
@@ -306,7 +307,7 @@ class FullyShardedDataParallel(nn.Module):
if self.fp32_reduce_scatter and not self.mixed_precision:
raise ValueError("fp32_reduce_scatter requires mixed_precision=True")
if self.move_params_to_cpu and not self.mixed_precision:
raise ValueError("cpu_offload requires mixed_precision=True")
raise ValueError("move_params_to_cpu requires mixed_precision=True")
# skip validation if the process group was created above
if process_group:
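The check above is the one whose message this commit fixes; a small hedged sketch of how it surfaces to a caller (again assuming a process group is already initialized; the module is a placeholder):

```python
import torch.nn as nn
from fairscale.nn import FullyShardedDataParallel as FSDP

try:
    # mixed_precision defaults to False, so asking for CPU params alone
    # trips the validation with the corrected message.
    FSDP(nn.Linear(8, 8), move_params_to_cpu=True)
except ValueError as err:
    print(err)  # move_params_to_cpu requires mixed_precision=True
```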
@@ -634,7 +635,7 @@ class FullyShardedDataParallel(nn.Module):
f"buffer_dtype={self.buffer_dtype}, "
f"fp32_reduce_scatter={self.fp32_reduce_scatter}, "
f"compute_device={self.compute_device}"
f"cpu_offload={self.move_params_to_cpu}, "
f"move_params_to_cpu={self.move_params_to_cpu}, "
f"move_grads_to_cpu={self.move_grads_to_cpu}, "
f"bucket_cap_mb={self.bucket_cap_mb}, "
f"clear_autocast_cache={self.clear_autocast_cache}"
@@ -987,7 +988,7 @@ class FullyShardedDataParallel(nn.Module):
``_fp32_shard``: a single shard of the parameters in full precision
(typically FP32, but this is dependent on the dtype of the model
as it's passed in by the user). This can be on CPU or GPU
depending on the value of *``cpu_offload``*.
depending on the value of *``move_params_to_cpu``*.
``_fp16_shard``: if *``mixed_precision``* is ``True``, this will be
a single shard of the parameters in FP16, used for all-gather.
``_full_param_padded``: the full weight (padded to be evenly
@@ -1834,8 +1835,8 @@ class FullyShardedDataParallel(nn.Module):
assert p._fp16_shard is not None
alloc_storage_(p._fp16_shard, size=p._fp32_shard.size())
p._fp16_shard.copy_(
# If cpu_offload is True, this will be non-blocking because
# _fp32_shard is pinned, otherwise it's a no-op.
# If move_params_to_cpu is True, this will be non-blocking
# because _fp32_shard is pinned, otherwise it's a no-op.
p._fp32_shard.to(p._fp16_shard.device, non_blocking=True)
)
p.data = p._fp16_shard
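The updated comment leans on a general CUDA rule: a host-to-device copy is asynchronous only when the source tensor lives in pinned (page-locked) memory. A standalone sketch of that behaviour, independent of FSDP, with illustrative tensor names and sizes:

```python
import torch

# Stand-in for an FP32 master shard kept on CPU; pin_memory=True page-locks
# it so the host-to-device transfer below can be issued asynchronously.
fp32_shard = torch.randn(1024, 1024, pin_memory=True)
fp16_shard = torch.empty(1024, 1024, dtype=torch.float16, device="cuda")

# non_blocking=True only overlaps with other work because fp32_shard is
# pinned; with pageable memory the same call degrades to a synchronous copy.
fp16_shard.copy_(fp32_shard.to(fp16_shard.device, non_blocking=True))

# Wait for the copy (and the cast performed by copy_) before reading.
torch.cuda.current_stream().synchronize()
```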