Unverified Commit 4f797295 authored by Chang Liu's avatar Chang Liu Committed by GitHub
Browse files

[Bugfix] Fix dataloader pytorch cuda indexing (#4297)



* Modify to repro crash

* Revert to orig. scenario and add fix

* Update
Co-authored-by: Xin Yao <xiny@nvidia.com>
parent 7e6a6b4a
......@@ -39,7 +39,7 @@ def train(
model_optimizer.zero_grad()
in_nodes = {rel: nid.to(device) for rel, nid in in_nodes.items()}
out_nodes = out_nodes[predict_category].to(device)
out_nodes = out_nodes[predict_category].to(labels.device)
blocks = [block.to(device) for block in blocks]
batch_labels = labels[out_nodes].to(device)
......@@ -102,7 +102,7 @@ def validate(
for step, (in_nodes, out_nodes, blocks) in enumerate(dataloader):
in_nodes = {rel: nid.to(device)
for rel, nid in in_nodes.items()}
out_nodes = out_nodes[predict_category].to(device)
out_nodes = out_nodes[predict_category].to(labels.device)
blocks = [block.to(device) for block in blocks]
batch_labels = labels[out_nodes].to(device)
......
......@@ -157,7 +157,7 @@ class TensorizedDataset(torch.utils.data.IterableDataset):
def __iter__(self):
    """Return an iterator over batches of seed-node IDs.

    Splits ``self._indices`` across dataloader workers via
    ``_divide_by_worker`` and gathers the corresponding rows of
    ``self._id_tensor`` for this worker's shard.
    """
    indices = _divide_by_worker(self._indices, self.batch_size, self.drop_last)
    # Index directly with ``indices`` — the stale pre-fix form moved the
    # index tensor with ``indices.to(self._device)`` first, which caused a
    # CUDA indexing crash (upstream fix #4297).
    # NOTE(review): assumes ``indices`` and ``self._id_tensor`` already live
    # on compatible devices — confirm against the dataset constructor.
    id_tensor = self._id_tensor[indices]
    return _TensorizedDatasetIter(
        id_tensor, self.batch_size, self.drop_last, self._mapping_keys, self._shuffle)
......@@ -223,12 +223,7 @@ class DDPTensorizedDataset(torch.utils.data.IterableDataset):
"""Shuffles the dataset."""
# Only rank 0 does the actual shuffling. The other ranks wait for it.
if self.rank == 0:
if self._device == torch.device('cpu'):
np.random.shuffle(self._indices[:self.num_indices].numpy())
else:
self._indices[:self.num_indices] = self._indices[
torch.randperm(self.num_indices, device=self._indices.device)]
if not self.drop_last:
# pad extra
self._indices[self.num_indices:] = \
......@@ -239,7 +234,7 @@ class DDPTensorizedDataset(torch.utils.data.IterableDataset):
start = self.num_samples * self.rank
end = self.num_samples * (self.rank + 1)
indices = _divide_by_worker(self._indices[start:end], self.batch_size, self.drop_last)
id_tensor = self._id_tensor[indices.to(self._device)]
id_tensor = self._id_tensor[indices]
return _TensorizedDatasetIter(
id_tensor, self.batch_size, self.drop_last, self._mapping_keys, self._shuffle)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment