"vscode:/vscode.git/clone" did not exist on "dccd5c2b1acdee242d847e1fbeea2edabfdde15f"
Commit e5ca7e62 authored by hepj987's avatar hepj987
Browse files

初始化仓库

parents
Pipeline #437 failed with stages
in 0 seconds
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os
import urllib.request
import sys
class SquadDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/squad'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
if not os.path.exists(self.save_path + '/v1.1'):
os.makedirs(self.save_path + '/v1.1')
if not os.path.exists(self.save_path + '/v2.0'):
os.makedirs(self.save_path + '/v2.0')
self.download_urls = {
'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
}
def download(self):
for item in self.download_urls:
url = item
file = self.download_urls[item]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + file):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + file, "wb") as handle:
handle.write(response.read())
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from itertools import islice
import multiprocessing
import statistics
class Sharding:
def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
assert len(input_files) > 0, 'The input file list must contain at least one file.'
assert n_training_shards > 0, 'There must be at least one output shard.'
assert n_test_shards > 0, 'There must be at least one output shard.'
self.n_training_shards = n_training_shards
self.n_test_shards = n_test_shards
self.fraction_test_set = fraction_test_set
self.input_files = input_files
self.output_name_prefix = output_name_prefix
self.output_training_identifier = '_training'
self.output_test_identifier = '_test'
self.output_file_extension = '.txt'
self.articles = {} # key: integer identifier, value: list of articles
self.sentences = {} # key: integer identifier, value: list of sentences
self.output_training_files = {} # key: filename, value: list of articles to go into file
self.output_test_files = {} # key: filename, value: list of articles to go into file
self.init_output_files()
# Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
def load_articles(self):
print('Start: Loading Articles')
global_article_count = 0
for input_file in self.input_files:
print('input file:', input_file)
with open(input_file, mode='r', newline='\n') as f:
for i, line in enumerate(f):
if line.strip():
self.articles[global_article_count] = line.rstrip()
global_article_count += 1
print('End: Loading Articles: There are', len(self.articles), 'articles.')
def segment_articles_into_sentences(self, segmenter):
print('Start: Sentence Segmentation')
if len(self.articles) is 0:
self.load_articles()
assert len(self.articles) is not 0, 'Please check that input files are present and contain data.'
# TODO: WIP: multiprocessing (create independent ranges and spawn processes)
use_multiprocessing = 'serial'
def chunks(data, size=len(self.articles)):
it = iter(data)
for i in range(0, len(data), size):
yield {k: data[k] for k in islice(it, size)}
if use_multiprocessing == 'manager':
manager = multiprocessing.Manager()
return_dict = manager.dict()
jobs = []
n_processes = 7 # in addition to the main process, total = n_proc+1
def work(articles, return_dict):
sentences = {}
for i, article in enumerate(articles):
sentences[i] = segmenter.segment_string(articles[article])
if i % 5000 == 0:
print('Segmenting article', i)
return_dict.update(sentences)
for item in chunks(self.articles, len(self.articles)):
p = multiprocessing.Process(target=work, args=(item, return_dict))
# Busy wait
while len(jobs) >= n_processes:
pass
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
elif use_multiprocessing == 'queue':
work_queue = multiprocessing.Queue()
jobs = []
for item in chunks(self.articles, len(self.articles)):
pass
else: # serial option
for i, article in enumerate(self.articles):
self.sentences[i] = segmenter.segment_string(self.articles[article])
if i % 5000 == 0:
print('Segmenting article', i)
print('End: Sentence Segmentation')
def init_output_files(self):
print('Start: Init Output Files')
assert len(self.output_training_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
assert len(self.output_test_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
for i in range(self.n_training_shards):
name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
self.output_training_files[name] = []
for i in range(self.n_test_shards):
name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
self.output_test_files[name] = []
print('End: Init Output Files')
def get_sentences_per_shard(self, shard):
result = 0
for article_id in shard:
result += len(self.sentences[article_id])
return result
def distribute_articles_over_shards(self):
print('Start: Distribute Articles Over Shards')
assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'
# Create dictionary with - key: sentence count per article, value: article id number
sentence_counts = defaultdict(lambda: [])
max_sentences = 0
total_sentences = 0
for article_id in self.sentences:
current_length = len(self.sentences[article_id])
sentence_counts[current_length].append(article_id)
max_sentences = max(max_sentences, current_length)
total_sentences += current_length
n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards
consumed_article_set = set({})
unused_article_set = set(self.articles.keys())
# Make first pass and add one article worth of lines per file
for file in self.output_training_files:
current_article_id = sentence_counts[max_sentences][-1]
sentence_counts[max_sentences].pop(-1)
self.output_training_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
print('Warning: A single article contains more than the nominal number of sentences per training shard.')
for file in self.output_test_files:
current_article_id = sentence_counts[max_sentences][-1]
sentence_counts[max_sentences].pop(-1)
self.output_test_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
print('Warning: A single article contains more than the nominal number of sentences per test shard.')
training_counts = []
test_counts = []
for shard in self.output_training_files:
training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
training_median = statistics.median(training_counts)
test_median = statistics.median(test_counts)
# Make subsequent passes over files to find articles to add without going over limit
history_remaining = []
n_history_remaining = 4
while len(consumed_article_set) < len(self.articles):
for fidx, file in enumerate(self.output_training_files):
nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
nominal_next_article_size -= 1
if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or training_counts[fidx] > training_median:
continue # skip adding to this file, will come back later if no file can accept unused articles
current_article_id = sentence_counts[nominal_next_article_size][-1]
sentence_counts[nominal_next_article_size].pop(-1)
self.output_training_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
for fidx, file in enumerate(self.output_test_files):
nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
nominal_next_article_size -= 1
if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or test_counts[fidx] > test_median:
continue # skip adding to this file, will come back later if no file can accept unused articles
current_article_id = sentence_counts[nominal_next_article_size][-1]
sentence_counts[nominal_next_article_size].pop(-1)
self.output_test_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed
if len(history_remaining) == n_history_remaining:
history_remaining.pop(0)
history_remaining.append(len(unused_article_set))
history_same = True
for i in range(1, len(history_remaining)):
history_same = history_same and (history_remaining[i-1] == history_remaining[i])
if history_same:
nominal_sentences_per_training_shard += 1
# nominal_sentences_per_test_shard += 1
training_counts = []
test_counts = []
for shard in self.output_training_files:
training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
training_median = statistics.median(training_counts)
test_median = statistics.median(test_counts)
print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')
if len(unused_article_set) != 0:
print('Warning: Some articles did not make it into output files.')
for shard in self.output_training_files:
print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))
print('End: Distribute Articles Over Shards')
def write_shards_to_disk(self):
print('Start: Write Shards to Disk')
for shard in self.output_training_files:
self.write_single_shard(shard, self.output_training_files[shard])
for shard in self.output_test_files:
self.write_single_shard(shard, self.output_test_files[shard])
print('End: Write Shards to Disk')
def write_single_shard(self, shard_name, shard):
with open(shard_name, mode='w', newline='\n') as f:
for article_id in shard:
for line in self.sentences[article_id]:
f.write(line + '\n')
f.write('\n') # Line break between articles
import nltk
nltk.download('punkt')
class NLTKSegmenter:
def __init(self):
pass
def segment_string(self, article):
return nltk.tokenize.sent_tokenize(article)
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os
import urllib.request
import subprocess
import sys
class WikiDownloader:
def __init__(self, language, save_path):
self.save_path = save_path + '/wikicorpus_' + language
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
self.language = language
self.download_urls = {
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
}
self.output_files = {
'en' : 'wikicorpus_en.xml.bz2',
'zh' : 'wikicorpus_zh.xml.bz2'
}
def download(self):
if self.language in self.download_urls:
url = self.download_urls[self.language]
filename = self.output_files[self.language]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + filename):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + filename, "wb") as handle:
handle.write(response.read())
# Always unzipping since this is relatively fast and will overwrite
print('Unzipping:', self.output_files[self.language])
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
else:
assert False, 'WikiDownloader not implemented for this language yet.'
\ No newline at end of file
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
class WikicorpusTextFormatting:
def __init__(self, wiki_path, output_filename, recursive = False):
self.wiki_path = wiki_path
self.recursive = recursive
self.output_filename = output_filename
# This puts one article per line
def merge(self):
with open(self.output_filename, mode='w', newline='\n') as ofile:
for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
print(filename)
article_lines = []
article_open = False
with open(filename, mode='r', newline='\n') as file:
for line in file:
if '<doc id=' in line:
article_open = True
elif '</doc>' in line:
article_open = False
for oline in article_lines[1:]:
if oline != '\n':
ofile.write(oline.rstrip() + " ")
ofile.write("\n\n")
article_lines = []
else:
if article_open:
article_lines.append(line)
\ No newline at end of file
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import BookscorpusTextFormatting
import Downloader
import TextSharding
import WikicorpusTextFormatting
import argparse
import itertools
import multiprocessing
import os
import pprint
import subprocess
def main(args):
working_dir = os.environ['BERT_PREP_WORKING_DIR']
print('Working Directory:', working_dir)
print('Action:', args.action)
print('Dataset Name:', args.dataset)
if args.input_files:
args.input_files = args.input_files.split(',')
hdf5_tfrecord_folder_prefix = "_lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \
+ "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \
+ "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor)
directory_structure = {
'download' : working_dir + '/download', # Downloaded and decompressed
'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor)
'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same
'sharded' : working_dir + '/sharded_' + "training_shards_" + str(args.n_training_shards) + "_test_shards_" + str(args.n_test_shards) + "_fraction_" + str(args.fraction_test_set),
'tfrecord' : working_dir + '/tfrecord'+ hdf5_tfrecord_folder_prefix,
'hdf5': working_dir + '/hdf5' + hdf5_tfrecord_folder_prefix
}
print('\nDirectory Structure:')
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(directory_structure)
print('')
if args.action == 'download':
if not os.path.exists(directory_structure['download']):
os.makedirs(directory_structure['download'])
downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
downloader.download()
elif args.action == 'text_formatting':
assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' and args.dataset != 'squad' and args.dataset != 'mrpc', 'Cannot perform text_formatting on pretrained weights'
if not os.path.exists(directory_structure['extracted']):
os.makedirs(directory_structure['extracted'])
if not os.path.exists(directory_structure['formatted']):
os.makedirs(directory_structure['formatted'])
if args.dataset == 'bookscorpus':
books_path = directory_structure['download'] + '/bookscorpus'
#books_path = directory_structure['download']
output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
books_formatter.merge()
elif args.dataset == 'wikicorpus_en':
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
elif args.dataset == 'wikicorpus_zh':
assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.'
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.'
elif args.action == 'sharding':
# Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset:
if args.input_files is None:
if args.dataset == 'bookscorpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
elif args.dataset == 'wikicorpus_en':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
elif args.dataset == 'wikicorpus_zh':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
elif args.dataset == 'books_wiki_en_corpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
if not os.path.exists(directory_structure['sharded']):
os.makedirs(directory_structure['sharded'])
if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset)
# Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
# it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
# Different languages (e.g., Chinese simplified/traditional) may require translation and
# other packages to be called from here -- just add a conditional branch for those extra steps
segmenter = TextSharding.NLTKSegmenter()
sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
sharding.load_articles()
sharding.segment_articles_into_sentences(segmenter)
sharding.distribute_articles_over_shards()
sharding.write_shards_to_disk()
else:
assert False, 'Unsupported dataset for sharding'
elif args.action == 'create_tfrecord_files':
assert False, 'TFrecord creation not supported in this PyTorch model example release.' \
''
if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset):
os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset)
def create_record_worker(filename_prefix, shard_id, output_format='tfrecord'):
bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
return last_process
output_file_prefix = args.dataset
for i in range(args.n_training_shards):
last_process =create_record_worker(output_file_prefix + '_training', i)
last_process.wait()
for i in range(args.n_test_shards):
last_process = create_record_worker(output_file_prefix + '_test', i)
last_process.wait()
elif args.action == 'create_hdf5_files':
last_process = None
if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset):
os.makedirs(directory_structure['hdf5'] + "/" + args.dataset)
def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
return last_process
output_file_prefix = args.dataset
for i in range(args.n_training_shards):
last_process = create_record_worker(output_file_prefix + '_training', i)
last_process.wait()
for i in range(args.n_test_shards):
last_process = create_record_worker(output_file_prefix + '_test', i)
last_process.wait()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Preprocessing Application for Everything BERT-related'
)
parser.add_argument(
'--action',
type=str,
help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
choices={
'download', # Download and verify mdf5/sha sums
'text_formatting', # Convert into a file that contains one article/book per line
'sharding', # Convert previous formatted text into shards containing one sentence per line
'create_tfrecord_files', # Turn each shard into a TFrecord with masking and next sentence prediction info
'create_hdf5_files' # Turn each shard into a HDF5 file with masking and next sentence prediction info
}
)
parser.add_argument(
'--dataset',
type=str,
help='Specify the dataset to perform --action on',
choices={
'bookscorpus',
'wikicorpus_en',
'wikicorpus_zh',
'books_wiki_en_corpus',
'google_pretrained_weights',
'nvidia_pretrained_weights',
'mrpc',
'sst-2',
'squad',
'all'
}
)
parser.add_argument(
'--input_files',
type=str,
help='Specify the input files in a comma-separated list (no spaces)'
)
parser.add_argument(
'--n_training_shards',
type=int,
help='Specify the number of training shards to generate',
default=256
)
parser.add_argument(
'--n_test_shards',
type=int,
help='Specify the number of test shards to generate',
default=256
)
parser.add_argument(
'--fraction_test_set',
type=float,
help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
default=0.1
)
parser.add_argument(
'--segmentation_method',
type=str,
help='Specify your choice of sentence segmentation',
choices={
'nltk'
},
default='nltk'
)
parser.add_argument(
'--n_processes',
type=int,
help='Specify the max number of processes to allow at one time',
default=4
)
parser.add_argument(
'--random_seed',
type=int,
help='Specify the base seed to use for any random number generation',
default=12345
)
parser.add_argument(
'--dupe_factor',
type=int,
help='Specify the duplication factor',
default=5
)
parser.add_argument(
'--masked_lm_prob',
type=float,
help='Specify the probability for masked lm',
default=0.15
)
parser.add_argument(
'--max_seq_length',
type=int,
help='Specify the maximum sequence length',
default=512
)
parser.add_argument(
'--max_predictions_per_seq',
type=int,
help='Specify the maximum number of masked words per sequence',
default=20
)
parser.add_argument(
'--do_lower_case',
type=int,
help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)',
default=1
)
parser.add_argument(
'--vocab_file',
type=str,
help='Specify absolute path to vocab file to use)'
)
parser.add_argument(
'--skip_wikiextractor',
type=int,
help='Specify whether to skip wikiextractor step 0=False, 1=True',
default=0
)
parser.add_argument(
'--interactive_json_config_generator',
type=str,
help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords'
)
args = parser.parse_args()
main(args)
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
to_download=${1:-"wiki_only"}
#Download
if [ "$to_download" = "wiki_books" ] ; then
python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
fi
python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab
python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
python3 /workspace/bert/data/bertPrep.py --action download --dataset sst-2
# Properly format the text files
if [ "$to_download" = "wiki_books" ] ; then
python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset bookscorpus
fi
python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset wikicorpus_en
if [ "$to_download" = "wiki_books" ] ; then
DATASET="books_wiki_en_corpus"
else
DATASET="wikicorpus_en"
# Shard the text files
fi
# Shard the text files
python3 /workspace/bert/data/bertPrep.py --action sharding --dataset $DATASET
# Create HDF5 files Phase 1
python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 128 \
--max_predictions_per_seq 20 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
# Create HDF5 files Phase 2
python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 512 \
--max_predictions_per_seq 80 --vocab_file $BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Downloading dataset for squad..."
# Download SQuAD
v1="v1.1"
mkdir $v1
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json
wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py
EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945 -'
EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552 -'
EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9 -'
CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum`
CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum`
CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum`
v2="v2.0"
mkdir $v2
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json
wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py
EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740 -'
EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8 -'
EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627 -'
CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum`
CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum`
CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum`
echo "Squad data download done!"
echo "Verifying Dataset...."
if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then
echo "train-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then
echo "dev-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then
echo "evaluate-v1.1.py is corrupted! md5sum doesn't match"
fi
if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then
echo "train-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then
echo "dev-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then
echo "evaluate-v2.0.py is corrupted! md5sum doesn't match"
fi
echo "Complete!"
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def exact_match_score(prediction, ground_truth):
return (normalize_answer(prediction) == normalize_answer(ground_truth))
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions):
f1 = exact_match = total = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
if __name__ == '__main__':
expected_version = '1.1'
parser = argparse.ArgumentParser(
description='Evaluation for SQuAD ' + expected_version)
parser.add_argument('dataset_file', help='Dataset file')
parser.add_argument('prediction_file', help='Prediction File')
args = parser.parse_args()
with open(args.dataset_file) as dataset_file:
dataset_json = json.load(dataset_file)
if (dataset_json['version'] != expected_version):
print('Evaluation expects v-' + expected_version +
', but got dataset with v-' + dataset_json['version'],
file=sys.stderr)
dataset = dataset_json['data']
with open(args.prediction_file) as prediction_file:
predictions = json.load(prediction_file)
print(json.dumps(evaluate(dataset, predictions)))
This diff is collapsed.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError("HEAD request failed for url {} with status code {}"
.format(url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
This diff is collapsed.
This diff is collapsed.
DLLL {"timestamp": "1689585949.809111", "datetime": "2023-07-17 17:25:49.809111", "elapsedtime": "0.000235", "type": "LOG", "step": "PARAMETER", "data": {"Config": ["Namespace(amp=False, bert_model='bert-large-uncased', cache_dir=None, config_file='/public/home/hepj/model_source/pytorch_bert/bert_config.json', disable_progress_bar=False, dist_url='tcp://224.66.41.62:23456', do_eval=False, do_lower_case=False, do_predict=True, do_train=True, doc_stride=128, eval_script='./evaluate-v1.1.py', fp16=False, gpus_per_node=1, gradient_accumulation_steps=1, init_checkpoint='/public/home/hepj/model_source/pytorch_bert/model.ckpt-28252.pt', json_summary='./log/results.json', learning_rate=5e-05, local_rank=-1, log_freq=50, loss_scale=0, max_answer_length=30, max_query_length=64, max_seq_length=384, max_steps=-1.0, n_best_size=20, no_cuda=False, null_score_diff_threshold=0.0, num_train_epochs=3.0, output_dir='/public/home/hepj/outdir/torch/SQuAD', predict_batch_size=4, predict_file='/public/home/hepj/data/sq1.1/dev-v1.1.json', seed=42, skip_cache=False, skip_checkpoint=False, train_batch_size=4, train_file='/public/home/hepj/data/sq1.1/train-v1.1.json', use_env=False, verbose_logging=False, version_2_with_negative=False, vocab_file='/public/home/hepj/model_source/pytorch_bert/vocab.txt', warmup_proportion=0.1, world_size=1)"]}}
DLLL {"timestamp": "1689585950.006137", "datetime": "2023-07-17 17:25:50.006137", "elapsedtime": "0.197261", "type": "LOG", "step": "PARAMETER", "data": {"SEED": 42}}
DLLL {"timestamp": "1689585970.324955", "datetime": "2023-07-17 17:26:10.324955", "elapsedtime": "20.516079", "type": "LOG", "step": "PARAMETER", "data": {"loading_checkpoint": true}}
DLLL {"timestamp": "1689585974.448674", "datetime": "2023-07-17 17:26:14.448674", "elapsedtime": "24.639798", "type": "LOG", "step": "PARAMETER", "data": {"loaded_checkpoint": true}}
DLLL {"timestamp": "1689585976.67685", "datetime": "2023-07-17 17:26:16.676850", "elapsedtime": "26.867974", "type": "LOG", "step": "PARAMETER", "data": {"model_weights_num": 335150082}}
DLLL {"timestamp": "1689585989.449134", "datetime": "2023-07-17 17:26:29.449134", "elapsedtime": "39.640258", "type": "LOG", "step": "PARAMETER", "data": {"train_start": true}}
DLLL {"timestamp": "1689585989.467614", "datetime": "2023-07-17 17:26:29.467614", "elapsedtime": "39.658738", "type": "LOG", "step": "PARAMETER", "data": {"training_samples": 87599}}
DLLL {"timestamp": "1689585989.467693", "datetime": "2023-07-17 17:26:29.467693", "elapsedtime": "39.658817", "type": "LOG", "step": "PARAMETER", "data": {"training_features": 88368}}
DLLL {"timestamp": "1689585989.467758", "datetime": "2023-07-17 17:26:29.467758", "elapsedtime": "39.658882", "type": "LOG", "step": "PARAMETER", "data": {"train_batch_size": 4}}
DLLL {"timestamp": "1689585989.46782", "datetime": "2023-07-17 17:26:29.467820", "elapsedtime": "39.658944", "type": "LOG", "step": "PARAMETER", "data": {"steps": 65697.0}}
DLLL {"timestamp": "1689586004.55256", "datetime": "2023-07-17 17:26:44.552560", "elapsedtime": "54.743684", "type": "LOG", "step": [0, 1], "data": {"step_loss": 6.121078014373779, "learning_rate": 5e-05}}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment