default=25,metadata={"help":"bucket size for reduction"}
)
# Boolean option, off by default; the help text below describes the trade-off
# (less shuffling randomness in exchange for not re-reading the data).
fix_batches_to_gpus: bool = field(
    default=False,
    metadata=dict(
        help=(
            "don't shuffle batches between GPUs; this reduces overall "
            "randomness and may affect precision but avoids the cost of "
            "re-reading the data"
        ),
    ),
)
# Boolean option, off by default. Turning it on *enables* unused-parameter
# detection; the previous help text said "disable", which contradicted both
# the option name and its False default.
# NOTE(review): presumably forwarded to torch's DistributedDataParallel
# `find_unused_parameters` argument -- confirm against the consumer.
find_unused_parameters: bool = field(
    default=False,
    metadata={
        "help": "enable unused parameter detection (not applicable to "
        "--ddp-backend=legacy_ddp)"
    },
)
gradient_as_bucket_view:bool=field(
default=False,
metadata={
"help":"when set to True, gradients will be views pointing to different offsets of allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size. "