Commit 56ea6d78 authored by Michael Carilli

saving for carl to review

parent 3c7a0e44
@@ -82,17 +82,20 @@ def casted_args(cast_fn, args, kwargs):
         return new_args

 def cached_cast(cast_fn, x, cache):
+    print("Calling cached_cast")
     if is_nested(x):
         return type(x)([cached_cast(y) for y in x])
     if x in cache:
         cached_x = cache[x]
-        # During eval, it's possible to end up caching casted weights
-        # with requires_grad == False. This is then a problem when they
-        # get reused on the next train iter. So we ensure that cached
-        # weights have same requires_grad flag of most recent request.
+        if x.requires_grad and cached_x.requires_grad:
+            # Check to make sure x is actually cached_x's autograd parent.
+            if cached_x.grad_fn.next_functions[1][0].variable is not x:
+                raise RuntimeError("x and cache[x] both require grad, but x is not "
+                                   "cache[x]'s parent. This is likely an error.")
         if x.requires_grad != cached_x.requires_grad:
-            cached_x.requires_grad_(x.requires_grad)
-        return cache[x]
+            del cache[x]
+        else:
+            return cached_x
     casted_x = cast_fn(x)
     cache[x] = casted_x
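A note on the new parent check: when a leaf tensor that requires grad is cast, the casted tensor's grad_fn holds a reference back to the original leaf through an AccumulateGrad node, which is what `cached_x.grad_fn.next_functions[1][0].variable is not x` relies on. A minimal standalone sketch of that relationship (not part of this commit; the exact index into next_functions can vary across PyTorch versions, so this searches rather than hard-coding [1][0]):

```python
import torch

# Standalone illustration: a leaf that requires grad, and its casted copy.
x = torch.randn(4, requires_grad=True)
casted = x.half()

# The cast records an autograd edge back to x via an AccumulateGrad node.
# Search the immediate parents instead of assuming a fixed position.
parents = [fn for fn, _ in casted.grad_fn.next_functions if fn is not None]
is_parent = any(getattr(fn, "variable", None) is x for fn in parents)
print("x is casted's autograd parent:", is_parent)  # expected: True
```

If the cached copy's graph no longer points back at x, the cache entry is stale, which is the condition the new RuntimeError guards against.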
@@ -292,7 +292,8 @@ class DistributedDataParallel(Module):
         # Sanity checks that all the buckets were kicked off
         if self.next_bucket != self.num_buckets:
-            raise RuntimeError("In epilogue, next_bucket != num_buckets. "
+            raise RuntimeError("In epilogue, next_bucket ({}) != num_buckets ({}). ".format(
+                               self.next_bucket, self.num_buckets),
                                "This probably indicates some buckets were not allreduced.")

         for actual, expected in zip(self.buckets_ready_size, self.bucket_sizes):
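A small aside on the new message: RuntimeError called with two arguments keeps both in its args tuple, so the printed text renders as a tuple rather than one sentence; formatting a single string first avoids that. A standalone sketch with illustrative values (1 and 3 are made up, not from the commit):

```python
# Two-argument form: str() of the exception shows the args tuple.
two_args = RuntimeError(
    "In epilogue, next_bucket ({}) != num_buckets ({}). ".format(1, 3),
    "This probably indicates some buckets were not allreduced.")
print(str(two_args))

# Single-string form: the adjacent literals are concatenated, then formatted,
# so the exception text reads as one sentence.
one_string = RuntimeError(
    "In epilogue, next_bucket ({}) != num_buckets ({}). "
    "This probably indicates some buckets were not allreduced.".format(1, 3))
print(str(one_string))
```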
@@ -389,6 +390,8 @@ class DistributedDataParallel(Module):
     def allreduce_fallback(self):
         grads = [param.grad.data for param in self.module.parameters() if param.grad is not None]
+
+        print("In allreduce_fallback: {}".format(len(grads)))
         split_buckets = split_half_float_double(grads)

         # If retain_allreduce_buffers is True and delay_allreduce is False,
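For orientation, the fallback path groups the gathered gradients by element type before flattening and allreducing each group. A rough standalone sketch of that kind of dtype bucketing (illustrative only; `split_half_float_double` is apex's own helper and may differ in detail):

```python
import torch
from collections import defaultdict

def split_by_dtype(tensors):
    # Group tensors by dtype so each group can be flattened into one
    # contiguous buffer and allreduced with a single collective call.
    buckets = defaultdict(list)
    for t in tensors:
        buckets[t.dtype].append(t)
    order = (torch.float16, torch.float32, torch.float64)
    return [buckets[dt] for dt in order if dt in buckets]

grads = [torch.ones(3, dtype=torch.float16), torch.ones(2), torch.ones(4, dtype=torch.float64)]
print([len(b) for b in split_by_dtype(grads)])  # [1, 1, 1]
```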
@@ -413,6 +416,7 @@ class DistributedDataParallel(Module):
         self.buckets[bucket_idx][bucket_loc] = param.grad.data
         self.buckets_ready_size[bucket_idx] += 1
+        print(self.buckets_ready_size)

         if self.buckets_ready_size[bucket_idx] == self.bucket_sizes[bucket_idx]:
             if bucket_idx == self.next_bucket:
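The counters printed here feed the in-order launch logic: buckets can fill in any order, but an allreduce is only kicked off for the bucket that matches next_bucket, and buckets that complete early wait in ready_buckets_not_reduced. A toy standalone sketch of that ordering (names and structure are illustrative, not apex's actual code):

```python
# Buckets become "ready" out of order, but allreduces launch strictly in order.
def mark_bucket_ready(bucket_idx, state):
    state["ready"].add(bucket_idx)
    # Drain every consecutive ready bucket starting from next_bucket.
    while state["next_bucket"] in state["ready"]:
        idx = state["next_bucket"]
        state["ready"].discard(idx)
        print("launch allreduce for bucket", idx)  # stand-in for the real collective
        state["next_bucket"] += 1

state = {"next_bucket": 0, "ready": set()}
for idx in (2, 0, 1):  # buckets finish out of order
    mark_bucket_ready(idx, state)
# launches bucket 0 first, then 1 and 2 once the gap is filled
```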
@@ -472,6 +476,9 @@ class DistributedDataParallel(Module):
         self.allreduce_buffers = [None for _ in range(self.num_buckets)]
         self.next_bucket = 0
         self.ready_buckets_not_reduced = set()
+
+        print(len(param_list), len(self.active_params), [len(b) for b in self.buckets],
+              self.needs_refresh)

         self.active_params = param_list