Unverified commit fb7b6a93 authored by Rohan Varma, committed by GitHub

[FairScale] Remove refs to "cpu_offload" in code comments (#814)

* fix

* remove dup file
parent 8acbec71
@@ -191,7 +191,7 @@ class FullyShardedDataParallel(nn.Module):
move_grads_to_cpu (bool, Optional):
move gradient shard to CPU after reduction. This is useful when
combined with CPU-based optimizers. It defaults to the value of
*``cpu_offload``*.
*``move_params_to_cpu``*.
bucket_cap_mb (int, Optional):
FSDP will bucket parameters so that gradient reduction can
be more efficient for small parameters.
@@ -251,7 +251,8 @@ class FullyShardedDataParallel(nn.Module):
cpu_offload (bool, Optional):
if ``True``, offload FP32 params to CPU. This is only relevant when
*``mixed_precision``* is ``True``. Note: This arg will be deprecated in favor of
*``move_params_to_cpu``* in an upcoming release.
*``move_params_to_cpu``* in an upcoming release. Please prefer
specifying ``move_params_to_cpu`` instead.
"""
def __init__(
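For context, the arguments touched by this rename fit together as sketched below. This is a minimal, hedged example and not part of the commit: the model and sizes are placeholders, and it assumes ``torch.distributed`` has already been initialized so FSDP can use the default process group. ``move_grads_to_cpu`` is left unset so it falls back to the value of ``move_params_to_cpu``, and the deprecated ``cpu_offload`` alias is avoided.

```python
import torch.nn as nn
from fairscale.nn import FullyShardedDataParallel as FSDP

# Placeholder model; in practice this is the module you want to shard.
model = nn.Linear(1024, 1024).cuda()

# move_params_to_cpu replaces the deprecated cpu_offload alias and requires
# mixed_precision=True. move_grads_to_cpu is omitted, so it defaults to the
# value of move_params_to_cpu, as the updated docstring states.
fsdp_model = FSDP(
    model,
    mixed_precision=True,
    move_params_to_cpu=True,
)
```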
@@ -306,7 +307,7 @@ class FullyShardedDataParallel(nn.Module):
if self.fp32_reduce_scatter and not self.mixed_precision:
raise ValueError("fp32_reduce_scatter requires mixed_precision=True")
if self.move_params_to_cpu and not self.mixed_precision:
raise ValueError("cpu_offload requires mixed_precision=True")
raise ValueError("move_params_to_cpu requires mixed_precision=True")
# skip validation if the process group was created above
if process_group:
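The check above is the one whose message this commit fixes; a small hedged sketch of how it surfaces to a caller (again assuming a process group is already initialized; the module is a placeholder):

```python
import torch.nn as nn
from fairscale.nn import FullyShardedDataParallel as FSDP

try:
    # mixed_precision defaults to False, so asking for CPU params alone
    # trips the validation with the corrected message.
    FSDP(nn.Linear(8, 8), move_params_to_cpu=True)
except ValueError as err:
    print(err)  # move_params_to_cpu requires mixed_precision=True
```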
@@ -634,7 +635,7 @@ class FullyShardedDataParallel(nn.Module):
f"buffer_dtype={self.buffer_dtype}, "
f"fp32_reduce_scatter={self.fp32_reduce_scatter}, "
f"compute_device={self.compute_device}"
f"cpu_offload={self.move_params_to_cpu}, "
f"move_params_to_cpu={self.move_params_to_cpu}, "
f"move_grads_to_cpu={self.move_grads_to_cpu}, "
f"bucket_cap_mb={self.bucket_cap_mb}, "
f"clear_autocast_cache={self.clear_autocast_cache}"
@@ -987,7 +988,7 @@ class FullyShardedDataParallel(nn.Module):
``_fp32_shard``: a single shard of the parameters in full precision
(typically FP32, but this is dependent on the dtype of the model
as it's passed in by the user). This can be on CPU or GPU
depending on the value of *``cpu_offload``*.
depending on the value of *``move_params_to_cpu``*.
``_fp16_shard``: if *``mixed_precision``* is ``True``, this will be
a single shard of the parameters in FP16, used for all-gather.
``_full_param_padded``: the full weight (padded to be evenly
@@ -1834,8 +1835,8 @@ class FullyShardedDataParallel(nn.Module):
assert p._fp16_shard is not None
alloc_storage_(p._fp16_shard, size=p._fp32_shard.size())
p._fp16_shard.copy_(
# If cpu_offload is True, this will be non-blocking because
# _fp32_shard is pinned, otherwise it's a no-op.
# If move_params_to_cpu is True, this will be non-blocking
# because _fp32_shard is pinned, otherwise it's a no-op.
p._fp32_shard.to(p._fp16_shard.device, non_blocking=True)
)
p.data = p._fp16_shard
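The updated comment leans on a general CUDA rule: a host-to-device copy is asynchronous only when the source tensor lives in pinned (page-locked) memory. A standalone sketch of that behaviour, independent of FSDP, with illustrative tensor names and sizes:

```python
import torch

# Stand-in for an FP32 master shard kept on CPU; pin_memory=True page-locks
# it so the host-to-device transfer below can be issued asynchronously.
fp32_shard = torch.randn(1024, 1024, pin_memory=True)
fp16_shard = torch.empty(1024, 1024, dtype=torch.float16, device="cuda")

# non_blocking=True only overlaps with other work because fp32_shard is
# pinned; with pageable memory the same call degrades to a synchronous copy.
fp16_shard.copy_(fp32_shard.to(fp16_shard.device, non_blocking=True))

# Wait for the copy (and the cast performed by copy_) before reading.
torch.cuda.current_stream().synchronize()
```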