Merge branch 'dataset-utils-lint' into 'staging'

Lint megatron/data/dataset_utils.py

See merge request ADLR/megatron-lm!42

Merge branch 'dataset-utils-lint' into 'staging'
Lint megatron/data/dataset_utils.py
See merge request ADLR/megatron-lm!42
1eccd462 · Mohammad Shoeybi · 63262827 · 840759b8 · 1eccd462
Commit 1eccd462 authored Apr 03, 2020 by Mohammad Shoeybi
Show whitespace changes
Inline Side-by-side

Showing with 172 additions and 171 deletions

megatron/data/dataset_utils.py megatron/data/dataset_utils.py +172 -171

No files found.
--- a/megatron/data/dataset_utils.py
+++ b/megatron/data/dataset_utils.py
@@ -132,6 +132,7 @@ def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
            tokens.pop()
    return True

+
 def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
    """Merge segments A and B, add [CLS] and [SEP] and build tokentypes."""

@@ -233,7 +234,7 @@ def create_masked_lm_predictions(tokens,
    for idx in range(len(cand_indexes)):
        ngram_index = []
        for n in ngrams:
-      ngram_index.append(cand_indexes[idx:idx+n])
+            ngram_index.append(cand_indexes[idx:idx + n])
        ngram_indexes.append(ngram_index)

    np_rng.shuffle(ngram_indexes)
@@ -367,12 +368,12 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
    assert len(masked_positions) == len(masked_labels)

    # Tokens and token types.
-    filler = [pad_id]*padding_length
+    filler = [pad_id] * padding_length
    tokens_np = np.array(tokens + filler, dtype=np.int64)
    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

    # Padding mask.
-    padding_mask_np = np.array([1]*num_tokens + [0]*padding_length,
+    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
                               dtype=np.int64)

    # Labels and loss mask.