Commit 5655f076 authored by Mohammad

cleaned up old gpt2 dataset stuff from openwebtext

parent 20764e12
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import time
import os
import sys
from tokenizer import Tokenizer


def tokenize_corpus(filename, np_filename, print_interval=10000):
    """Tokenize a JSONL shard and save the per-document token arrays to a .npy file."""
    print(' > tokenizing {}'.format(filename))
    tokenizer = Tokenizer(cache_dir='./cache')
    tokenized_docs = []
    num_docs = 0
    num_tokens = 0
    start_time = time.time()
    with open(filename, 'r') as f:
        for line in f:
            try:
                myjson = json.loads(line)
                url = myjson['url']
                sample = myjson['text']
                tokens = tokenizer.tokenize_document(sample)
                # Token ids fit in uint16 (GPT-2 vocab size < 65535).
                tokenized_docs.append(np.array(tokens, dtype=np.uint16))
                num_docs += 1
                num_tokens += len(tokens)
                if num_docs % print_interval == 0:
                    print('  processed {:9d} documents in {:.2f} (s) so far'.format(
                        num_docs, time.time() - start_time), flush=True)
            except Exception as e:
                print('  skipping ', line, e)
    print(' >> processed {} documents with a total of {} tokens ...'.format(
        num_docs, num_tokens))
    # Documents have different lengths, so store them as an object array
    # of uint16 arrays (requires pickling).
    tokenized_docs = np.array(tokenized_docs, dtype=object)
    np.save(np_filename, tokenized_docs, allow_pickle=True)
    print(' >> saved the tokenized documents to {} ...'.format(np_filename))


if __name__ == '__main__':

    print('building gpt2 dataset ...')

    path = sys.argv[1]
    shard = sys.argv[2]
    input_filename = os.path.join(path,
                                  'shards/shard_{:04d}'.format(int(shard)))
    output_filename = os.path.join(path,
                                   'npys/shard_{:04d}.npy'.format(int(shard)))
    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))

    tokenize_corpus(input_filename, output_filename)
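# Example input/invocation (a sketch, not part of the original script): each shard
# under <path>/shards/ is assumed to be a JSONL file with one document per line,
# carrying at least the 'url' and 'text' fields read above, e.g.
#
#   {"url": "http://example.com/a", "text": "first document ..."}
#   {"url": "http://example.com/b", "text": "second document ..."}
#
# so shard 0 of a corpus rooted at a hypothetical /data/openwebtext could be
# tokenized with
#
#   python make_gpt2_dataset.py /data/openwebtext 0
#
# which would write the object array of per-document uint16 token arrays to
# /data/openwebtext/npys/shard_0000.npy.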
import glob
import json
import os
import time
import sys
import numpy as np


if __name__ == '__main__':

    print('building the shard sizes ...')

    path = sys.argv[1]
    print('> reading numpy files from {}'.format(path))
    npy_files = glob.glob(path + '/*.npy')
    npy_files.sort()
    print('  found {} numpy files'.format(len(npy_files)))

    size_dict = {}
    counter = 0
    start_time = time.time()
    for filename in npy_files:
        data = np.load(filename, allow_pickle=True)
        # Total number of tokens across all documents in this shard.
        size = np.hstack(data).size
        np_filename = os.path.basename(filename)
        size_dict[np_filename] = size
        counter += 1
        if counter % 10 == 0:
            print('  processed {} files in {:.2f} seconds'.format(
                counter, time.time() - start_time))

    output_filename = os.path.join(path, 'sizes.txt')
    with open(output_filename, 'w') as f:
        json.dump(size_dict, f)
    print('> wrote sizes to {}'.format(output_filename))
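# Downstream usage sketch (an illustration, not part of the original script):
# sizes.txt is plain JSON mapping each shard's .npy filename to its total token
# count, so it can be read back with the standard library, e.g.
#
#   import json
#   with open('sizes.txt', 'r') as f:
#       sizes = json.load(f)
#   total_tokens = sum(sizes.values())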
#!/bin/bash
echo "processing gpt2 data ..."
DIR="/raid/mpatwary/redownload_v0/0-21"
for thread in {0..3}; do
echo " launching thread "$thread && python make_gpt2_dataset.py $DIR $thread > $DIR/logs/shard_$thread.log 2>&1 &
done
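
# Note (an assumption inferred from the paths above, not stated in the original
# script): $DIR/shards/shard_0000 .. shard_0003 must already exist, and the logs/
# and npys/ directories must be created beforehand, e.g.
#
#   mkdir -p "$DIR/logs" "$DIR/npys"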
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append('..')
from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer


class Tokenizer:

    def __init__(self, cache_dir=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                       cache_dir=cache_dir)
        self.tokenizer.max_len = int(1e12)
        self.eod_token = self.tokenizer.encoder['<|endoftext|>']
        assert self.eod_token < 65535, 'vocab size will not fit in uint16'
        print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
            len(self.tokenizer.encoder), self.eod_token))

    def tokenize_document(self, document):
        # Encode the document and append the end-of-document token.
        tokens = self.tokenizer.encode(document)
        tokens.append(self.eod_token)
        return tokens
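

# Usage sketch (an illustration, not part of the original file):
#
#   tokenizer = Tokenizer(cache_dir='./cache')
#   tokens = tokenizer.tokenize_document('Hello world.')
#
# `tokens` is a list of ints ending with the <|endoftext|> id; since the GPT-2
# vocabulary has roughly 50k entries, every id fits in np.uint16, which is what
# the assert in __init__ guarantees.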