Unverified Commit 4f797295 authored by Chang Liu, committed by GitHub

[Bugfix] Fix dataloader pytorch cuda indexing (#4297)



* Modify to repro crash

* Revert to orig. scenario and add fix

* Update
Co-authored-by: Xin Yao <xiny@nvidia.com>
parent 7e6a6b4a
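
For context (not part of the commit itself): the crash pattern this fixes is advanced indexing with an index tensor that lives on a different device than the indexed tensor. A minimal sketch of the failure mode, assuming a CUDA-capable PyTorch build (variable names are illustrative, not from the DGL source):

    import torch

    # Mirrors TensorizedDataset._id_tensor: node IDs kept on the host.
    id_tensor = torch.arange(10)                 # CPU tensor

    if torch.cuda.is_available():
        bad_indices = torch.tensor([1, 3, 5], device='cuda')
        # Indexing a CPU tensor with a CUDA index tensor raises a
        # RuntimeError; this is what `indices.to(self._device)` triggered
        # when _id_tensor stayed on the CPU.
        try:
            _ = id_tensor[bad_indices]
        except RuntimeError as err:
            print('cross-device indexing failed:', err)

        # The fix keeps the index tensor on id_tensor's device instead of
        # moving it to self._device first.
        good_indices = bad_indices.cpu()
        print(id_tensor[good_indices])           # tensor([1, 3, 5])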
@@ -39,7 +39,7 @@ def train(
         model_optimizer.zero_grad()
         in_nodes = {rel: nid.to(device) for rel, nid in in_nodes.items()}
-        out_nodes = out_nodes[predict_category].to(device)
+        out_nodes = out_nodes[predict_category].to(labels.device)
         blocks = [block.to(device) for block in blocks]
         batch_labels = labels[out_nodes].to(device)
@@ -102,7 +102,7 @@ def validate(
     for step, (in_nodes, out_nodes, blocks) in enumerate(dataloader):
        in_nodes = {rel: nid.to(device)
                    for rel, nid in in_nodes.items()}
-        out_nodes = out_nodes[predict_category].to(device)
+        out_nodes = out_nodes[predict_category].to(labels.device)
         blocks = [block.to(device) for block in blocks]
         batch_labels = labels[out_nodes].to(device)
...
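
Why `labels.device` rather than `device` in the two hunks above: `out_nodes` is used to index `labels`, so it must live on whatever device `labels` occupies; if `labels` stays on the CPU while `device` is a GPU, `labels[out_nodes]` would mix devices. A hedged sketch of the pattern (the surrounding training loop is assumed, not shown in the diff):

    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    labels = torch.randint(0, 5, (100,))         # often kept on the CPU
    out_nodes = torch.tensor([0, 42, 99])

    # Move the indices to wherever the labels actually are, then move the
    # gathered batch to the training device; this works whether labels are
    # on the CPU or already on the GPU.
    out_nodes = out_nodes.to(labels.device)
    batch_labels = labels[out_nodes].to(device)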
@@ -157,7 +157,7 @@ class TensorizedDataset(torch.utils.data.IterableDataset):
     def __iter__(self):
         indices = _divide_by_worker(self._indices, self.batch_size, self.drop_last)
-        id_tensor = self._id_tensor[indices.to(self._device)]
+        id_tensor = self._id_tensor[indices]
         return _TensorizedDatasetIter(
             id_tensor, self.batch_size, self.drop_last, self._mapping_keys, self._shuffle)
@@ -223,12 +223,7 @@ class DDPTensorizedDataset(torch.utils.data.IterableDataset):
         """Shuffles the dataset."""
         # Only rank 0 does the actual shuffling. The other ranks wait for it.
         if self.rank == 0:
-            if self._device == torch.device('cpu'):
-                np.random.shuffle(self._indices[:self.num_indices].numpy())
-            else:
-                self._indices[:self.num_indices] = self._indices[
-                    torch.randperm(self.num_indices, device=self._indices.device)]
+            np.random.shuffle(self._indices[:self.num_indices].numpy())
             if not self.drop_last:
                 # pad extra
                 self._indices[self.num_indices:] = \
@@ -239,7 +234,7 @@ class DDPTensorizedDataset(torch.utils.data.IterableDataset):
         start = self.num_samples * self.rank
         end = self.num_samples * (self.rank + 1)
         indices = _divide_by_worker(self._indices[start:end], self.batch_size, self.drop_last)
-        id_tensor = self._id_tensor[indices.to(self._device)]
+        id_tensor = self._id_tensor[indices]
         return _TensorizedDatasetIter(
             id_tensor, self.batch_size, self.drop_last, self._mapping_keys, self._shuffle)
...
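
One detail worth noting in the `shuffle` hunk (an observation, not stated in the commit message): `tensor.numpy()` returns a NumPy view that shares storage with the CPU tensor, so `np.random.shuffle` on that view permutes the tensor in place; with the indices kept on the host, the CUDA `randperm` branch appears to be unnecessary. A minimal sketch of the memory-sharing behavior:

    import numpy as np
    import torch

    indices = torch.arange(8)
    # .numpy() shares storage with the CPU tensor, so shuffling the NumPy
    # view shuffles the torch tensor in place as well.
    np.random.shuffle(indices[:6].numpy())
    print(indices)   # first 6 entries permuted; last 2 untouched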