Commit 316d3f90 authored by Pan,Huiwen

Add DeepSpeed (ds) framework test model

parent aebde649
DeepSpeed @ 6bd444a7
Subproject commit 6bd444a7c62e9d7d320dd4c1e1142062f50c861d
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v1.2.3
  hooks:
  - id: trailing-whitespace
    exclude: "Megatron-LM/"
  - id: check-yaml
    exclude: "Megatron-LM/"
  - id: end-of-file-fixer
    exclude: "Megatron-LM/"
- repo: https://github.com/pre-commit/mirrors-yapf
  rev: v0.29.0
  hooks:
  - id: yapf
    exclude: "Megatron-LM/"
{
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 1,
  "steps_per_print": 10,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 2e-5,
      "weight_decay": 0.0,
      "bias_correction": true
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": false
  }
}
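The JSON above is a standard DeepSpeed configuration: train_batch_size must equal train_micro_batch_size_per_gpu times gradient accumulation steps times the data-parallel world size. A minimal sketch of how such a file is consumed, assuming a DeepSpeed version where the path can be passed via the config= keyword (older versions read it from args.deepspeed_config); the file name ds_config.json and the toy model are illustrative only:

    import torch
    import deepspeed

    # Toy model purely for illustration.
    model = torch.nn.Linear(10, 2)

    # deepspeed.initialize wraps the model in an engine that applies the
    # optimizer, gradient clipping, and fp16 settings from the JSON config.
    model_engine, optimizer, _, _ = deepspeed.initialize(
        model=model,
        model_parameters=model.parameters(),
        config="ds_config.json")  # assumed path to the config shown above

This would normally be launched with the deepspeed launcher so the distributed environment variables the engine expects are already set.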
import os
import random
import h5py
import logging
import json
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import RandomSampler
from torch.utils.data.distributed import DistributedSampler

from bert_dataset_provider import BertDatasetProviderInterface
from turing.dataset import BatchType, map_to_torch


# Workaround because python functions are not picklable
class WorkerInitObj(object):
    def __init__(self, seed):
        self.seed = seed

    def __call__(self, id):
        np.random.seed(seed=self.seed + id)
        random.seed(self.seed + id)


def create_pretraining_dataset(input_file, max_predictions_per_seq,
                               num_workers, train_batch_size, worker_init,
                               data_sampler):
    train_data = pretraining_dataset(
        input_file=input_file, max_predictions_per_seq=max_predictions_per_seq)
    train_dataloader = DataLoader(train_data,
                                  sampler=data_sampler(train_data),
                                  batch_size=train_batch_size,
                                  num_workers=num_workers,
                                  worker_init_fn=worker_init,
                                  pin_memory=True)
    return train_dataloader, len(train_data)


class pretraining_dataset(Dataset):
    def __init__(self, input_file, max_predictions_per_seq):
        self.input_file = input_file
        self.max_predictions_per_seq = max_predictions_per_seq
        f = h5py.File(input_file, "r")
        keys = [
            'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions',
            'masked_lm_ids', 'next_sentence_labels'
        ]
        self.inputs = [np.asarray(f[key][:]) for key in keys]
        f.close()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.inputs[0])

    def __getitem__(self, index):
        [
            input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, next_sentence_labels
        ] = [
            torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else
            torch.from_numpy(np.asarray(input[index].astype(np.int64)))
            for indice, input in enumerate(self.inputs)
        ]
        # Scatter the masked-LM ids back onto a full-length label vector;
        # unmasked positions keep the ignore label -1.
        masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
        index = self.max_predictions_per_seq
        # store number of masked tokens in index
        padded_mask_indices = (masked_lm_positions == 0).nonzero()
        if len(padded_mask_indices) != 0:
            index = padded_mask_indices[0].item()
        masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
        return [
            map_to_torch([BatchType.PRETRAIN_BATCH]), input_ids, input_mask,
            segment_ids, next_sentence_labels, masked_lm_labels
        ]


class NvidiaBertDatasetProvider(BertDatasetProviderInterface):
    def __init__(self, args):
        self.num_workers = args.config['training']['num_workers']
        self.max_seq_length = args.max_seq_length
        self.max_predictions_per_seq = args.max_predictions_per_seq
        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        self.train_micro_batch_size_per_gpu = args.train_micro_batch_size_per_gpu
        self.logger = args.logger

        if args.local_rank == -1:
            self.global_rank = 0
            self.world_size = 1
        else:
            self.global_rank = dist.get_rank()
            self.world_size = dist.get_world_size()

        # Initialize dataset files
        dataset_path = os.path.join(
            args.data_path_prefix,
            args.config['data']['datasets']['pretrain_dataset'])
        self.dataset_files = [
            os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if
            os.path.isfile(os.path.join(dataset_path, f)) and 'training' in f
        ]
        self.dataset_files.sort()
        random.shuffle(self.dataset_files)
        self.num_files = len(self.dataset_files)
        self.data_sampler = RandomSampler

        self.worker_init = WorkerInitObj(args.seed + args.local_rank)
        self.dataset_future = None
        self.pool = ProcessPoolExecutor(1)

        if self.global_rank == 0:
            self.logger.info(
                f"NvidiaBertDatasetProvider - Initialization: num_files = {self.num_files}"
            )

    def get_shard(self, index):
        if self.dataset_future is None:
            data_file = self._get_shard_file(index)
            self.train_dataloader, sample_count = create_pretraining_dataset(
                input_file=data_file,
                max_predictions_per_seq=self.max_predictions_per_seq,
                num_workers=self.num_workers,
                train_batch_size=self.train_micro_batch_size_per_gpu,
                worker_init=self.worker_init,
                data_sampler=self.data_sampler)
        else:
            self.train_dataloader, sample_count = self.dataset_future.result(
                timeout=None)
        return self.train_dataloader, sample_count

    def release_shard(self, index):
        del self.train_dataloader

    def prefetch_shard(self, index):
        # Load the next shard in a worker process so I/O overlaps training.
        data_file = self._get_shard_file(index)
        self.dataset_future = self.pool.submit(
            create_pretraining_dataset, data_file,
            self.max_predictions_per_seq, self.num_workers,
            self.train_micro_batch_size_per_gpu, self.worker_init,
            self.data_sampler)

    def get_batch(self, batch_iter):
        return batch_iter

    def prefetch_batch(self):
        pass

    def _get_shard_file(self, shard_index):
        file_index = self._get_shard_file_index(shard_index, self.global_rank)
        return self.dataset_files[file_index % self.num_files]

    def _get_shard_file_index(self, shard_index, global_rank):
        if dist.is_initialized() and self.world_size > self.num_files:
            remainder = self.world_size % self.num_files
            file_index = (shard_index * self.world_size) + global_rank + (
                remainder * shard_index)
        else:
            file_index = shard_index * self.world_size + global_rank
        return file_index % self.num_files
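A hypothetical driver loop for the provider above, showing the shard-prefetch pattern the class is built around; the args namespace with the fields __init__ reads is assumed to be constructed elsewhere by the training script:

    provider = NvidiaBertDatasetProvider(args)

    for shard_index in range(provider.num_files):
        # Either loads synchronously (first shard) or picks up the
        # shard that prefetch_shard queued on the previous iteration.
        dataloader, sample_count = provider.get_shard(shard_index)
        if shard_index + 1 < provider.num_files:
            # Kick off async loading of the next shard while this one trains.
            provider.prefetch_shard(shard_index + 1)
        for batch in dataloader:
            pass  # forward/backward step would go here
        provider.release_shard(shard_index)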
__version__ = "0.4.0"
from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .modeling import (BertConfig, BertModel, BertForPreTraining,
BertForMaskedLM, BertForNextSentencePrediction,
BertForSequenceClassification, BertForMultipleChoice,
BertForTokenClassification, BertForQuestionAnswering)
from .optimization import BertAdam
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
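A minimal sketch of exercising the exports above, assuming the pytorch_pretrained_bert 0.4.0 API (the 'bert-base-uncased' shortcut name downloads the vocabulary and weights into PYTORCH_PRETRAINED_BERT_CACHE on first use, so network access is required):

    import torch
    from pytorch_pretrained_bert import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    tokens = tokenizer.tokenize("deepspeed makes bert training fast")
    ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])
    with torch.no_grad():
        # Returns the per-layer hidden states and the pooled [CLS] output.
        encoded_layers, pooled_output = model(torch.tensor([ids]))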