Include comprehensive block info when hashing

e6f2720d · Neel Kant · f3d2426e · e6f2720d · e6f2720d
Commit e6f2720d authored Apr 15, 2020 by Neel Kant
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 6 additions and 10 deletions

hashed_index.py hashed_index.py +5 -9

megatron/data/ict_dataset.py megatron/data/ict_dataset.py +1 -1

No files found.
--- a/hashed_index.py
+++ b/hashed_index.py
@@ -39,7 +39,7 @@ def main():
        try:
            input_tokens, input_types, input_pad_mask, \
            block_tokens, block_token_types, block_pad_mask, block_indices = get_batch(data_iter)
-        except StopIteration:
+        except:
            break

        # TODO: make sure input is still in block
@@ -49,20 +49,16 @@ def main():
        block_hash_pos = torch.matmul(block_logits, hash_matrix)
        block_hash_full = torch.cat((block_hash_pos, -block_hash_pos), axis=1)
        block_hashes = torch.argmax(block_hash_full, axis=1).detach().cpu().numpy()
-        for hash, idx in zip(block_hashes, block_indices):
-            hash_data[int(hash)].append(int(idx))
+        for hash, indices_array in zip(block_hashes, block_indices):
+            hash_data[int(hash)].append(indices_array)

        all_input_tokens.append(input_tokens.detach().cpu().numpy())
        all_input_logits.append(input_logits.detach().cpu().numpy())
        all_block_tokens.append(block_tokens.detach().cpu().numpy())
        all_block_logits.append(block_logits.detach().cpu().numpy())

-        if i % 10 == 0:
-            print(i, flush=True)
-            print(block_tokens[0])
-
-        if i == 100:
-            break
+        if i == 1000:
+            print(i)

        i += 1


--- a/megatron/data/ict_dataset.py
+++ b/megatron/data/ict_dataset.py
@@ -79,7 +79,7 @@ class InverseClozeDataset(Dataset):
            'context_text': np.array(context_tokens),
            'context_types': np.array(context_token_types),
            'context_pad_mask': np.array(context_pad_mask),
-            'context_indices': np.array([block_idx]).astype(np.int64)
+            'context_indices': np.array([start_idx, end_idx, doc_idx, block_idx]).astype(np.int64)
        }

        return sample