Unverified Commit 4f797295 authored by Chang Liu's avatar Chang Liu Committed by GitHub
Browse files

[Bugfix] Fix dataloader pytorch cuda indexing (#4297)



* Modify to repro crash

* Revert to orig. scenario and add fix

* Update
Co-authored-by: Xin Yao <xiny@nvidia.com>
parent 7e6a6b4a
......@@ -39,7 +39,7 @@ def train(
model_optimizer.zero_grad()
in_nodes = {rel: nid.to(device) for rel, nid in in_nodes.items()}
out_nodes = out_nodes[predict_category].to(device)
out_nodes = out_nodes[predict_category].to(labels.device)
blocks = [block.to(device) for block in blocks]
batch_labels = labels[out_nodes].to(device)
......@@ -102,7 +102,7 @@ def validate(
for step, (in_nodes, out_nodes, blocks) in enumerate(dataloader):
in_nodes = {rel: nid.to(device)
for rel, nid in in_nodes.items()}
out_nodes = out_nodes[predict_category].to(device)
out_nodes = out_nodes[predict_category].to(labels.device)
blocks = [block.to(device) for block in blocks]
batch_labels = labels[out_nodes].to(device)
......
......@@ -157,7 +157,7 @@ class TensorizedDataset(torch.utils.data.IterableDataset):
def __iter__(self):
    """Return an iterator over batches of seed-node IDs.

    Splits ``self._indices`` across dataloader workers via
    ``_divide_by_worker`` and gathers the corresponding rows of
    ``self._id_tensor`` for this worker's shard.
    """
    indices = _divide_by_worker(self._indices, self.batch_size, self.drop_last)
    # Index directly with ``indices`` — the stale pre-fix form moved the
    # index tensor with ``indices.to(self._device)`` first, which caused a
    # CUDA indexing crash (upstream fix #4297).
    # NOTE(review): assumes ``indices`` and ``self._id_tensor`` already live
    # on compatible devices — confirm against the dataset constructor.
    id_tensor = self._id_tensor[indices]
    return _TensorizedDatasetIter(
        id_tensor, self.batch_size, self.drop_last, self._mapping_keys, self._shuffle)
......@@ -223,12 +223,7 @@ class DDPTensorizedDataset(torch.utils.data.IterableDataset):
"""Shuffles the dataset."""
# Only rank 0 does the actual shuffling. The other ranks wait for it.
if self.rank == 0:
if self._device == torch.device('cpu'):
np.random.shuffle(self._indices[:self.num_indices].numpy())
else:
self._indices[:self.num_indices] = self._indices[
torch.randperm(self.num_indices, device=self._indices.device)]
if not self.drop_last:
# pad extra
self._indices[self.num_indices:] = \
......@@ -239,7 +234,7 @@ class DDPTensorizedDataset(torch.utils.data.IterableDataset):
start = self.num_samples * self.rank
end = self.num_samples * (self.rank + 1)
indices = _divide_by_worker(self._indices[start:end], self.batch_size, self.drop_last)
id_tensor = self._id_tensor[indices.to(self._device)]
id_tensor = self._id_tensor[indices]
return _TensorizedDatasetIter(
id_tensor, self.batch_size, self.drop_last, self._mapping_keys, self._shuffle)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment