Commit 5655f076 authored by Mohammad

cleaned up old gpt2 dataset stuff from openwebtext

parent 20764e12
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import time
import os
import sys
from tokenizer import Tokenizer


def tokenize_corpus(filename, np_filename, print_interval=10000):
    """Tokenize a JSONL shard and save the per-document token arrays to a .npy file."""
    print(' > tokenizing {}'.format(filename))
    tokenizer = Tokenizer(cache_dir='./cache')
    tokenized_docs = []
    num_docs = 0
    num_tokens = 0
    start_time = time.time()
    with open(filename, 'r') as f:
        for line in f:
            try:
                myjson = json.loads(line)
                url = myjson['url']
                sample = myjson['text']
                tokens = tokenizer.tokenize_document(sample)
                # Token ids fit in uint16 (GPT-2 vocab size < 65535).
                tokenized_docs.append(np.array(tokens, dtype=np.uint16))
                num_docs += 1
                num_tokens += len(tokens)
                if num_docs % print_interval == 0:
                    print('  processed {:9d} documents in {:.2f} (s) so far'.format(
                        num_docs, time.time() - start_time), flush=True)
            except Exception as e:
                print('  skipping ', line, e)
    print(' >> processed {} documents with a total of {} tokens ...'.format(
        num_docs, num_tokens))
    # Documents have different lengths, so store them as an object array
    # of uint16 arrays (requires pickling).
    tokenized_docs = np.array(tokenized_docs, dtype=object)
    np.save(np_filename, tokenized_docs, allow_pickle=True)
    print(' >> saved the tokenized documents to {} ...'.format(np_filename))


if __name__ == '__main__':

    print('building gpt2 dataset ...')

    path = sys.argv[1]
    shard = sys.argv[2]
    input_filename = os.path.join(path,
                                  'shards/shard_{:04d}'.format(int(shard)))
    output_filename = os.path.join(path,
                                   'npys/shard_{:04d}.npy'.format(int(shard)))
    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))

    tokenize_corpus(input_filename, output_filename)
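# Example input/invocation (a sketch, not part of the original script): each shard
# under <path>/shards/ is assumed to be a JSONL file with one document per line,
# carrying at least the 'url' and 'text' fields read above, e.g.
#
#   {"url": "http://example.com/a", "text": "first document ..."}
#   {"url": "http://example.com/b", "text": "second document ..."}
#
# so shard 0 of a corpus rooted at a hypothetical /data/openwebtext could be
# tokenized with
#
#   python make_gpt2_dataset.py /data/openwebtext 0
#
# which would write the object array of per-document uint16 token arrays to
# /data/openwebtext/npys/shard_0000.npy.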
import glob
import json
import os
import time
import sys
import numpy as np


if __name__ == '__main__':

    print('building the shard sizes ...')

    path = sys.argv[1]
    print('> reading numpy files from {}'.format(path))
    npy_files = glob.glob(path + '/*.npy')
    npy_files.sort()
    print('  found {} numpy files'.format(len(npy_files)))

    size_dict = {}
    counter = 0
    start_time = time.time()
    for filename in npy_files:
        data = np.load(filename, allow_pickle=True)
        # Total number of tokens across all documents in this shard.
        size = np.hstack(data).size
        np_filename = os.path.basename(filename)
        size_dict[np_filename] = size
        counter += 1
        if counter % 10 == 0:
            print('  processed {} files in {:.2f} seconds'.format(
                counter, time.time() - start_time))

    output_filename = os.path.join(path, 'sizes.txt')
    with open(output_filename, 'w') as f:
        json.dump(size_dict, f)
    print('> wrote sizes to {}'.format(output_filename))
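# Downstream usage sketch (an illustration, not part of the original script):
# sizes.txt is plain JSON mapping each shard's .npy filename to its total token
# count, so it can be read back with the standard library, e.g.
#
#   import json
#   with open('sizes.txt', 'r') as f:
#       sizes = json.load(f)
#   total_tokens = sum(sizes.values())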
#!/bin/bash
echo "processing gpt2 data ..."
DIR="/raid/mpatwary/redownload_v0/0-21"
for thread in {0..3}; do
echo " launching thread "$thread && python make_gpt2_dataset.py $DIR $thread > $DIR/logs/shard_$thread.log 2>&1 &
done
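
# Note (an assumption inferred from the paths above, not stated in the original
# script): $DIR/shards/shard_0000 .. shard_0003 must already exist, and the logs/
# and npys/ directories must be created beforehand, e.g.
#
#   mkdir -p "$DIR/logs" "$DIR/npys"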
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append('..')
from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer


class Tokenizer:

    def __init__(self, cache_dir=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                       cache_dir=cache_dir)
        self.tokenizer.max_len = int(1e12)
        self.eod_token = self.tokenizer.encoder['<|endoftext|>']
        assert self.eod_token < 65535, 'vocab size will not fit in uint16'
        print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
            len(self.tokenizer.encoder), self.eod_token))

    def tokenize_document(self, document):
        # Encode the document and append the end-of-document token.
        tokens = self.tokenizer.encode(document)
        tokens.append(self.eod_token)
        return tokens
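

# Usage sketch (an illustration, not part of the original file):
#
#   tokenizer = Tokenizer(cache_dir='./cache')
#   tokens = tokenizer.tokenize_document('Hello world.')
#
# `tokens` is a list of ints ending with the <|endoftext|> id; since the GPT-2
# vocabulary has roughly 50k entries, every id fits in np.uint16, which is what
# the assert in __init__ guarantees.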