adding comment to explain single process gradient averaging

c8d7c9f1 · jiej · 63e47d29 · c8d7c9f1
Commit c8d7c9f1 authored Jan 28, 2019 by jiej
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 0 deletions

tests/synced_batchnorm/two_gpu_unit_test.py tests/synced_batchnorm/two_gpu_unit_test.py +2 -0

No files found.
--- a/tests/synced_batchnorm/two_gpu_unit_test.py
+++ b/tests/synced_batchnorm/two_gpu_unit_test.py
@@ -92,6 +92,8 @@ inp_bn = inp_t.clone().requires_grad_()
 grad_bn = grad_output_t.clone().detach()
 out_bn = bn(inp_bn)
 out_bn.backward(grad_bn)
+# compensating for the averaging over processes done by DDP
+# in order to produce a mathematically equivalent result
 for param in bn.parameters():
    param.grad = param.grad / args.world_size
 bn_opt = optim.SGD(bn.parameters(), lr=1.0)