dtk23.04初始化

bf95e032 · hepj987 · bf95e032 · bf95e032 · bf95e032 · bf95e032
Commit bf95e032 authored Jul 14, 2023 by hepj987
20 changed files
--- a/.gitignore
+++ b/.gitignore
+# tests
+# megatron autogenerated indices
+tests/data/*/*npy
+tests/tools/openwebtext-1000.jsonl
+tmp/
+# macOS
+.DS_Store
+# Byte-compiled / optimized / DLL files
+*/__pycache__/
+*.py[cod]
+*.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask:
+instance/
+.webassets-cache
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+Pipfile
+Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# IntelliJ project settings
+.idea/
+# Module files are named <module>.iml, so match them by extension
+*.iml
+# VSCode
+.vscode/
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# static files generated from Django application
+media
+staticfiles
+/tags
+# tmp files
+*.swp
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel
+test:
+  script:
+    - pytest --junitxml=report.xml tests
+  artifacts:
+    when: always
+    reports:
+      junit: report.xml
\ No newline at end of file
--- a/CODEOWNERS
+++ b/CODEOWNERS
+* @bigscience-workshop/megatron-deepspeed-codeowners
--- a/LICENSE
+++ b/LICENSE
+The following applies to all files unless otherwise noted:
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+--
+This repository also contains code from Hugging Face Inc., Google Research,
+Facebook (from their Fairseq project), and Philip Popien. Files from these
+organizations have notices at the top of each file. Below are licenses
+used in those files, as indicated.
+------------- LICENSE FOR huggingface and Google Research code  --------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+------------- LICENSE FOR Facebook Fairseq code --------------
+MIT License
+Copyright (c) Facebook, Inc. and its affiliates.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/MANIFEST.in
+++ b/MANIFEST.in
+include megatron/data/Makefile
+include megatron/data/helpers.cpp
--- a/Makefile
+++ b/Makefile
+# Developer convenience targets. None of them produce a file of the same
+# name, so all of them are declared phony.
+.PHONY: help test style
+# Directories that the style target formats.
+check_dirs := tests tools/convert_checkpoint
+# Prints a usage summary built from the text that follows the double-hash
+# marker on each target line of the makefiles in MAKEFILE_LIST.
+help: ## this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-22s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+test: ## run tests
+	pytest tests
+style: ## checks for code style and applies formatting
+	black $(check_dirs)
+	isort $(check_dirs)
--- a/README.md
+++ b/README.md
+# Generative Pre-Training2(GPT2)
+### 模型介绍
+```
+GPT2模型：第二代生成式预训练模型（Generative Pre-Training2）。
+```
+### 模型结构
+```
+GPT2使用 Transformer 的 Decoder 结构，并对 Transformer Decoder 进行了一些改动，并通过Megatron和deepspeed进行分布式运行
+```
+### 数据集
+```
+#下载数据集
+wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
+#下载vocab文件
+wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+xz -d oscar-1GB.jsonl.xz
+#处理数据集
+python tools/preprocess_data.py \
+    --input oscar-1GB.jsonl \
+    --output-prefix my-gpt2 \
+    --vocab gpt2-vocab.json \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file gpt2-merges.txt \
+    --append-eod \
+    --workers 8
+```
+## GPT2预训练
+### 环境配置
+推荐使用docker方式运行，提供[光源](https://www.sourcefind.cn/)拉取的docker镜像：
+```
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py37-latest
+```
+进入docker
+```
+pip install -r requirements.txt  -i http://pypi.tuna.tsinghua.edu.cn/simple  --trusted-host pypi.tuna.tsinghua.edu.cn
+```
+### 训练（单节点测试样例）
+```
+rm megatron/arguments.py
+cp megatron/arguments.py-one_node megatron/arguments.py
+sh run-train.sh（基于单节点四卡）
+```
+```
+#重要参数
+MODEL_NAME 					模型名（自定义）
+CHECKPOINT_PATH				模型保存&加载路径
+DATA_PATH					数据集路径（转换后的）
+TENSORBOARD_PATH			tensorboard路径
+CODECARBON_PATH				codecarbon路径
+N_GPUS         				使用加速卡数量
+TP_SIZE  	 				TP数量
+PP_SIZE      				PP数量
+MICRO_BATCH_SIZE			MICRO_BATCH_SIZE大小
+GLOBAL_BATCH_SIZE           GLOBAL_BATCH_SIZE大小
+NLAYERS 					模型层数
+NHIDDEN						隐藏层维度
+NHEADS						多头注意力机制头数
+SEQ_LEN						最大长度
+SAVE_INTERVAL				保存频率
+--train-samples				训练样本数
+--eval-interval				验证频率
+--eval-iters				验证iter
+```
+### GPT2模型16B训练(多节点)
+要求DCU集群Slurm环境正常。
+推荐用户使用预编译好的python3.7包来快速建立python3虚拟环境，pytorch、apex、torchaudio、colossalai、faiss、mmcv-full 、torchvision、tensorflow需要在[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)下载所需DCU版本安装包
+```
+export PYTHON3_LIB_PATH=/python_lib_path
+virtualenv -p /python_bin_path/python3 --system-site-packages venv_gpt2
+source env.sh	#进入venv_gpt2虚拟环境
+pip install -r requirements.txt  -i http://pypi.tuna.tsinghua.edu.cn/simple  --trusted-host pypi.tuna.tsinghua.edu.cn
+```
+```
+rm megatron/arguments.py
+cp megatron/arguments.py-nodes megatron/arguments.py
+sbatch  run-16B.sh(主要参数在single-16B.sh)
+```
+```
+#重要参数
+MODEL_NAME 					模型名（自定义）
+CHECKPOINT_PATH				模型保存&加载路径
+DATA_PATH					数据集路径（转换后的）
+TENSORBOARD_PATH			tensorboard路径
+CODECARBON_PATH				codecarbon路径
+TP_SIZE  	 				TP数量
+PP_SIZE      				PP数量
+MICRO_BATCH_SIZE			MICRO_BATCH_SIZE大小
+GLOBAL_BATCH_SIZE           GLOBAL_BATCH_SIZE大小
+NLAYERS 					层数
+NHIDDEN						隐藏层维度
+NHEADS						注意力机制头数
+SEQ_LEN						最大长度
+SAVE_INTERVAL				保存频率
+--train-iters				训练步数
+--eval-interval				验证频率
+--eval-iters				验证iter
+```
+### 16B模型训练loss
+|   卡数    |   lm loss    |
+| :-------: | :----------: |
+| 32 x 4DCU | 1.965622E+00 |
+### 16B模型验证
+|   卡数    | lm loss value | lm loss PPL  |
+| :-------: | :-----------: | :----------: |
+| 32 x 4DCU | 4.299443E+00  | 7.365877E+01 |
+## GPT2文本生成
+使用GPT做文本生成时需要对训练好的模型进行转换，转换需要安装0.7.3版本 deepspeed（此工程已包含）
+```
+pip install deepspeed-0.7.3+unknown-cp37-cp37m-linux_x86_64.whl -i http://pypi.tuna.tsinghua.edu.cn/simple  --trusted-host pypi.tuna.tsinghua.edu.cn
+```
+对deepspeed进行一些修改
+```
+修改/usr/local/lib/python3.7/site-packages/deepspeed/checkpoint/constants.py
+第34行
+	ZERO_FILE_PREFIX = 'bf16_' + 'zero_pp_rank_'
+改为：
+	ZERO_FILE_PREFIX =  'zero_pp_rank_'
+修改/usr/local/lib/python3.7/site-packages/deepspeed/ops/op_builder/builder.py
+第133行 def assert_torch_info(torch_info):函数
+删除下边的版本判断
+	install_torch_version = torch_info['version']
+	install_cuda_version = torch_info['cuda_version']
+	install_hip_version = torch_info['hip_version']
+修改/usr/local/lib/python3.7/site-packages/deepspeed/runtime/state_dict_factory.py文件
+第177行def check_ckpt_list(self):函数
+删除mp_world_size判断
+	if 'mp_world_size' in sd.keys():
+            assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}"
+```
+### 转换脚本
+```
+sh conver.sh
+```
+```
+#重要参数
+需要将工程路径加入PYTHONPATH
+例如：export PYTHONPATH=/home/megatron-deepspeed_dtk22.10:$PYTHONPATH
+CHECKPOINT_PATH  需要转换的模型路径(具体到保存的global_step)
+output_folder	 转换后的模型路径
+target_tp		 转换后的TP数（需要与训练时保持一致） 	 
+target_pp		 转换后的PP数 （设置为1）
+```
+### 无条件文本生成
+```
+sh run-inf.sh（这里以单节点小模型为例）
+```
+```
+#生成时模型各项参数需要与训练时保持一致（TP也需要保持一致）
+--micro-batch-size  	micro-batch-size大小
+--out-seq-length		输出文本长度
+--genfile				生成文本保存位置
+--num-samples			生成样本个数
+```
+## loss收敛情况
+16B模型使用oscar数据集收敛情况如下：
+![image-20230524143710566](image-gpt-loss.png)
+![image-20230524143830580](image-gpt-loss2.png)
+## 源码仓库及问题反馈
+https://developer.hpccube.com/codes/modelzoo/gpt2-pytorch/
+## 参考
+https://github.com/bigscience-workshop/Megatron-DeepSpeed
--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
--- a/examples/create_embeddings.sh
+++ b/examples/create_embeddings.sh
+#!/bin/bash
+# Compute embeddings for each entry of a given dataset (e.g. Wikipedia)
+# by running tools/create_doc_index.py against a pretrained ICT checkpoint.
+# Replace the three <...> placeholders below with real paths before running.
+# NOTE(review): RANK and WORLD_SIZE are plain (non-exported) shell variables
+# and are not referenced again in this script - confirm whether the Python
+# tool expects them in its environment.
+RANK=0
+WORLD_SIZE=1
+# Wikipedia data can be downloaded from the following link:
+# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
+EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
+EMBEDDING_PATH=<Specify path to store embeddings>
+CHECKPOINT_PATH=<Specify path of pretrained ICT model>
+# Path arguments are quoted so that directories containing spaces or glob
+# characters reach the tool as single arguments.
+python tools/create_doc_index.py \
+    --num-layers 12 \
+    --hidden-size 768 \
+    --num-attention-heads 12 \
+    --tensor-model-parallel-size 1 \
+    --micro-batch-size 128 \
+    --checkpoint-activations \
+    --seq-length 512 \
+    --retriever-seq-length 256 \
+    --max-position-embeddings 512 \
+    --load "${CHECKPOINT_PATH}" \
+    --evidence-data-path "${EVIDENCE_DATA_DIR}" \
+    --embedding-path "${EMBEDDING_PATH}" \
+    --indexer-log-interval 1000 \
+    --indexer-batch-size 128 \
+    --vocab-file bert-vocab.txt \
+    --num-workers 2 \
+    --fp16
--- a/examples/curriculum_learning/README.md
+++ b/examples/curriculum_learning/README.md
+This is a short tutorial of how to use/tune the curriculum learning (CL) integration. Currently it is only integrated for GPT pre-training. For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084).
+# Disable batch size warmup (--rampup-batch-size)
+In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that curriculum learning (seqlen-based) provides much better training stability than the batch size warmup technique. So when using CL you need to remove the `--rampup-batch-size` config in your training script. It's not recommended to use both CL and batch size warmup, because both of them will reduce the number of tokens in a batch. Another related change you might want is to increase your micro batch size, since without batch size warmup your batch size will be fixed now.
+# Token-based training termination
+Because CL changes the length of each sequence/sample during training, it is very hard/impossible to use the number of steps/samples to terminate the training exactly at the desired number of tokens. Thus we add a `--train-tokens` config as an alternative, accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 2X of what you used for the baseline), and setting `--train-tokens` to the exact desired number of training tokens (e.g., 300B for GPT-3 like training).
+# Token-based LR decay
+Again because CL changes the number of tokens per batch, in our [paper](https://arxiv.org/abs/2108.06084) Appendix A.2 we show that it is also necessary to change the LR decay to token-based (to avoid decaying LR too fast). Thus we add a `--lr-decay-tokens` which will be the number of LR decay tokens. If previously you were using `--lr-decay-samples`, you can calculate your `--lr-decay-tokens` simply by multiplying the former by full seqlen (e.g. 2K for GPT-3). Then you need to replace `--lr-decay-samples` with `--lr-decay-tokens` in your script.
+# LR warmup adjustment
+For LR warmup we don't change it to token-based, because doing so for CL means slowing down the LR warmup, which is both unnecessary and harmful. However, you may need to adjust your `--lr-warmup-samples` or `--lr-warmup-iters` from non-CL cases for various reasons (e.g., if you used `--rampup-batch-size` in the non-CL case, for CL we don't use it so the number of samples per batch will be different at the beginning). Assuming you want to use `X` tokens to warm up the LR (for OpenAI GPT-3 this was 375M tokens), then for the CL case you may set `--lr-warmup-samples` as `X` divided by the `min_difficulty` below, or set `--lr-warmup-iters` as `X` divided by `min_difficulty * --global-batch-size`. This is a rough estimate based on the fact that CL starts from seqlen `min_difficulty` and the seqlen won't increase too much during LR warmup.
+# Token-based tensorboard
+Because of the above changes, we also add token-based tensorboard scalars. We also add scalars that plot the seqlen at each step.
+# Curriculum learning hyperparameters tuning strategy
+The curriculum learning hyperparameters are all located in the deepspeed config json file (see the example `ds_config_cl.json` in this dir). There are a few config entries that you may need to adjust to your circumstances, and two of which require some tuning. In our [paper](https://arxiv.org/abs/2108.06084) Appendix A.1 we have a more detailed tuning strategy description.
+1. `max_difficulty` should be set as the full seqlen (i.e., your `--seq-length`). No need to tune this.
+2. `min_difficulty` is the beginning seqlen used by CL. In general, a smaller `min_difficulty` could provide better stability/convergence speed benefit. However, we observe that for a larger model or for different training data, starting from a very small seqlen could lead to significant validation PPL fluctuation (or even divergence) at the very beginning. We recommend starting with `min_difficulty` at 64, and then increasing it if you observe problems at the very beginning. Note that to enable Tensor Core acceleration you should always use a multiple of 8.
+3. `total_curriculum_step` is the total number of steps used by CL. In general, a larger `total_curriculum_step` could provide better stability/convergence speed benefit. However, we observe that too large a `total_curriculum_step` could lead to overfitting and significant validation PPL fluctuation (or even divergence) at the first few multiples of LR warmup steps. In our paper we have a detailed tuning strategy based on binary search. However, if you want to reduce the tuning effort we recommend directly setting `total_curriculum_step` as half of the baseline's total number of steps. This may not provide the highest convergence speed benefit, but should provide enough training stability gains.
+4. `difficulty_step` is the change in seq length per CL step. A smaller value is preferable since it gives smoother CL and better stability. Like `min_difficulty`, it also needs to be a multiple of 8 for Tensor Core acceleration, thus 8 is a good default.
--- a/examples/curriculum_learning/ds_config_cl.json
+++ b/examples/curriculum_learning/ds_config_cl.json
+{
+  "train_batch_size": 512,
+  "gradient_accumulation_steps": 1,
+  "steps_per_print": 1,
+  "zero_optimization": {
+    "stage": 0
+  },
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.00015,
+      "max_grad_norm": 1.0,
+      "betas": [0.9, 0.95]
+    }
+  },
+  "gradient_clipping": 1.0,
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "wall_clock_breakdown": false,
+  "zero_allow_untested_optimizer": false,
+  "curriculum_learning": {
+    "enabled": true,
+    "curriculum_type": "seqlen",
+    "min_difficulty": 8,
+    "max_difficulty": 1024,
+    "schedule_type": "fixed_linear",
+    "schedule_config": {
+      "total_curriculum_step": 60000,
+      "difficulty_step": 8
+    }
+  }
+}
--- a/examples/curriculum_learning/pretrain_gpt_cl.sh
+++ b/examples/curriculum_learning/pretrain_gpt_cl.sh
+#!/bin/bash
+# This is a dummy train script to show how to use curriculum
+# learning, some parameters are not for actual GPT pretraining.
+TARGET_GLOBAL_BATCH_SIZE=512
+TRAIN_SAMPLES=146_484_375
+LR=1.0e-4
+MIN_LR=1.0e-5
+LR_DECAY_SAMPLES=126_953_125
+LR_WARMUP_SAMPLES=183_105
+SEQLEN=1024
+############################################################
+# New configs for curriculum learning, see README.md
+TRAIN_TOKENS=10_000_000_000
+LR_DECAY_TOKENS=$(($LR_DECAY_SAMPLES*$SEQLEN))
+############################################################
+LOG_INTERVAL=100
+EVAL_ITERS=10
+EVAL_INTERVAL=100
+SAVE_INTERVAL=1000
+VOCAB_PATH=/data/Megatron-LM/data/gpt2-vocab.json
+MERGE_PATH=/data/Megatron-LM/data/gpt2-merges.txt
+DATA_PATH=/data/Megatron-LM/data/indexed_datasets/megatron
+MICRO_BATCH_SIZE=1
+MP_SIZE=1
+PP_SIZE=1
+NUM_GPUS=128
+echo ${NUM_GPUS}
+if [[ $PP_SIZE -gt 0 ]]; then
+    DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) ))
+else
+    DP_SIZE=$(( ${NUM_GPUS} / ${MP_SIZE} ))
+fi
+GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${MICRO_BATCH_SIZE} * ${DP_SIZE}) ))
+NAME="gpt-117M-pp${PP_SIZE}-mp${MP_SIZE}-bsz${TARGET_GLOBAL_BATCH_SIZE}-mbsz${MICRO_BATCH_SIZE}-cl"
+current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+host="${HOSTNAME}"
+TENSORBOARD_DIR="tensorboard/${NAME}_${host}_${current_time}"
+mkdir -p ${TENSORBOARD_DIR}
+CHECKPOINT_PATH="checkpoints/${NAME}"
+megatron_options=" \
+        --data-path ${DATA_PATH} \
+        --vocab-file ${VOCAB_PATH} \
+        --merge-file ${MERGE_PATH} \
+        --data-impl mmap \
+        --override-lr-scheduler \
+        --adam-beta1 0.9 \
+        --adam-beta2 0.95 \
+        --tensor-model-parallel-size ${MP_SIZE} \
+        --init-method-std 0.014 \
+        --lr-decay-tokens ${LR_DECAY_TOKENS} \
+        --lr-warmup-samples ${LR_WARMUP_SAMPLES} \
+        --micro-batch-size ${MICRO_BATCH_SIZE} \
+        --global-batch-size ${TARGET_GLOBAL_BATCH_SIZE} \
+        --num-layers 12 \
+        --hidden-size 768 \
+        --num-attention-heads 16 \
+        --seq-length ${SEQLEN} \
+        --max-position-embeddings ${SEQLEN} \
+        --train-samples ${TRAIN_SAMPLES} \
+        --train-tokens ${TRAIN_TOKENS} \
+        --lr ${LR} \
+        --min-lr ${MIN_LR} \
+        --lr-decay-style cosine \
+        --split 98,2,0 \
+        --log-interval ${LOG_INTERVAL} \
+        --eval-interval ${EVAL_INTERVAL} \
+        --eval-iters ${EVAL_ITERS} \
+        --save-interval ${SAVE_INTERVAL} \
+        --weight-decay 0.1 \
+        --clip-grad 1.0 \
+        --hysteresis 2 \
+        --num-workers 0 \
+        --checkpoint-activations \
+        --fp16 \
+        --load ${CHECKPOINT_PATH} \
+        --save ${CHECKPOINT_PATH} \
+        --tensorboard-queue-size 1 \
+        --log-timers-to-tensorboard \
+        --log-batch-size-to-tensorboard \
+        --log-validation-ppl-to-tensorboard \
+        --tensorboard-dir ${TENSORBOARD_DIR}"
+config_json="ds_config_cl.json"
+deepspeed_options=" \
+		    --deepspeed \
+		    --deepspeed_config ${config_json} \
+		    --pipeline-model-parallel-size ${PP_SIZE} \
+		    --partition-activations"
+run_cmd="deepspeed ../../pretrain_gpt.py ${megatron_options} ${deepspeed_options} &>> ${NAME}.log"
+echo ${run_cmd}
+eval ${run_cmd}
+set +x
--- a/examples/evaluate_ict_zeroshot_nq.sh
+++ b/examples/evaluate_ict_zeroshot_nq.sh
+#!/bin/bash
+# Evaluate natural question test data given Wikipedia embeddings and pretrained
+# ICT model
+# Datasets can be downloaded from the following link:
+# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
+EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
+EMBEDDING_PATH=<Specify path of the embeddings>
+CHECKPOINT_PATH=<Specify path of pretrained ICT model>
+QA_FILE=<Path of the natural question test dataset>
+python tasks/main.py \
+    --task ICT-ZEROSHOT-NQ \
+    --tokenizer-type BertWordPieceLowerCase \
+    --num-layers 12 \
+    --hidden-size 768 \
+    --num-attention-heads 12 \
+    --tensor-model-parallel-size 1 \
+    --micro-batch-size 128 \
+    --checkpoint-activations \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --load ${CHECKPOINT_PATH} \
+    --evidence-data-path ${EVIDENCE_DATA_DIR} \
+    --embedding-path ${EMBEDDING_PATH} \
+    --retriever-seq-length 256 \
+    --vocab-file  bert-vocab.txt\
+    --qa-data-test ${QA_FILE} \
+    --num-workers 2 \
+    --faiss-use-gpu \
+    --retriever-report-topk-accuracies 1 5 20 100 \
+    --fp16
--- a/examples/evaluate_zeroshot_gpt.sh
+++ b/examples/evaluate_zeroshot_gpt.sh
+#!/bin/bash
+WORLD_SIZE=8
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+TASK="LAMBADA"
+VALID_DATA=<lambada path>
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+CHECKPOINT=checkpoints/gpt2_345m
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task $TASK \
+               --valid-data $VALID_DATA \
+               --tokenizer-type GPT2BPETokenizer \
+               --strict-lambada \
+               --vocab-file $VOCAB_FILE \
+               --merge-file $MERGE_FILE \
+               --load $CHECKPOINT \
+               --tensor-model-parallel-size 1 \
+               --num-layers 24 \
+               --hidden-size 1024 \
+               --num-attention-heads 16 \
+               --batch-size 8 \
+               --checkpoint-activations \
+               --seq-length 1024 \
+               --max-position-embeddings 1024 \
+               --log-interval 10 \
+               --fp16 \
+               --no-load-optim \
+               --no-load-rng
--- a/examples/finetune_mnli_distributed.sh
+++ b/examples/finetune_mnli_distributed.sh
+#!/bin/bash
+WORLD_SIZE=8
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+TRAIN_DATA="data/glue_data/MNLI/train.tsv"
+VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
+            data/glue_data/MNLI/dev_mismatched.tsv"
+PRETRAINED_CHECKPOINT=checkpoints/bert_345m
+VOCAB_FILE=bert-vocab.txt
+CHECKPOINT_PATH=checkpoints/bert_345m_mnli
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task MNLI \
+               --seed 1234 \
+               --train-data $TRAIN_DATA \
+               --valid-data $VALID_DATA \
+               --tokenizer-type BertWordPieceLowerCase \
+               --vocab-file $VOCAB_FILE \
+               --epochs 5 \
+               --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+               --tensor-model-parallel-size 1 \
+               --num-layers 24 \
+               --hidden-size 1024 \
+               --num-attention-heads 16 \
+               --micro-batch-size 8 \
+               --checkpoint-activations \
+               --lr 5.0e-5 \
+               --lr-decay-style linear \
+               --lr-warmup-fraction 0.065 \
+               --seq-length 512 \
+               --max-position-embeddings 512 \
+               --save-interval 500000 \
+               --save $CHECKPOINT_PATH \
+               --log-interval 10 \
+               --eval-interval 100 \
+               --eval-iters 50 \
+               --weight-decay 1.0e-1 \
+               --fp16
--- a/examples/finetune_race_distributed.sh
+++ b/examples/finetune_race_distributed.sh
+#!/bin/bash
+WORLD_SIZE=8
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+TRAIN_DATA="data/RACE/train/middle"
+VALID_DATA="data/RACE/dev/middle \
+            data/RACE/dev/high"
+VOCAB_FILE=bert-vocab.txt
+PRETRAINED_CHECKPOINT=checkpoints/bert_345m
+CHECKPOINT_PATH=checkpoints/bert_345m_race
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task RACE \
+               --seed 1234 \
+               --train-data $TRAIN_DATA \
+               --valid-data $VALID_DATA \
+               --tokenizer-type BertWordPieceLowerCase \
+               --vocab-file $VOCAB_FILE \
+               --epochs 3 \
+               --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+               --tensor-model-parallel-size 1 \
+               --num-layers 24 \
+               --hidden-size 1024 \
+               --num-attention-heads 16 \
+               --micro-batch-size 4 \
+               --checkpoint-activations \
+               --lr 1.0e-5 \
+               --lr-decay-style linear \
+               --lr-warmup-fraction 0.06 \
+               --seq-length 512 \
+               --max-position-embeddings 512 \
+               --save-interval 100000 \
+               --save $CHECKPOINT_PATH \
+               --log-interval 10 \
+               --eval-interval 100 \
+               --eval-iters 50 \
+               --weight-decay 1.0e-1 \
+               --clip-grad 1.0 \
+               --hidden-dropout 0.1 \
+               --attention-dropout 0.1 \
+               --fp16
--- a/examples/generate_text.sh
+++ b/examples/generate_text.sh
+#!/bin/bash
+# Runs tools/generate_samples_gpt.py against the checkpoint in
+# CHECKPOINT_PATH and names unconditional_samples.json as the --genfile.
+# Options are grouped into two ordered strings that are expanded unquoted so
+# each space-separated item becomes its own argument.
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2
+# Model, checkpoint and tokenizer options.
+MODEL_ARGS="--tensor-model-parallel-size 1 --num-layers 24 --hidden-size 1024"
+MODEL_ARGS="${MODEL_ARGS} --load ${CHECKPOINT_PATH} --num-attention-heads 16"
+MODEL_ARGS="${MODEL_ARGS} --max-position-embeddings 1024"
+MODEL_ARGS="${MODEL_ARGS} --tokenizer-type GPT2BPETokenizer --fp16"
+# Generation options.
+SAMPLE_ARGS="--batch-size 2 --seq-length 1024 --out-seq-length 1024"
+SAMPLE_ARGS="${SAMPLE_ARGS} --temperature 1.0"
+SAMPLE_ARGS="${SAMPLE_ARGS} --vocab-file ${VOCAB_FILE} --merge-file ${MERGE_FILE}"
+SAMPLE_ARGS="${SAMPLE_ARGS} --genfile unconditional_samples.json --num-samples 2"
+SAMPLE_ARGS="${SAMPLE_ARGS} --top_p 0.9 --recompute"
+python tools/generate_samples_gpt.py ${MODEL_ARGS} ${SAMPLE_ARGS}
--- a/examples/merge_mp_bert.sh
+++ b/examples/merge_mp_bert.sh
+#!/bin/bash
+TENSOR_MODEL_PARALLEL_SIZE=2
+VOCAB_FILE=bert-vocab.txt
+CHECKPOINT_PATH=checkpoints/bert_345m
+WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
+                                --model-type BERT \
+                                --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
+                                --tokenizer-type BertWordPieceLowerCase \
+                                --vocab-file $VOCAB_FILE \
+                                --num-layers 24 \
+                                --hidden-size 1024 \
+                                --num-attention-heads 16 \
+                                --seq-length 512 \
+                                --max-position-embeddings 512 \
+                                --load $CHECKPOINT_PATH
--- a/examples/pretrain_bert.sh
+++ b/examples/pretrain_bert.sh
+#!/bin/bash
+RANK=0
+WORLD_SIZE=1
+DATA_PATH=<Specify path and file prefix>_text_sentence
+CHECKPOINT_PATH=<Specify path>
+python pretrain_bert.py \
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --num-attention-heads 16 \
+       --micro-batch-size 4 \
+       --global-batch-size 8 \
+       --seq-length 512 \
+       --max-position-embeddings 512 \
+       --train-iters 2000000 \
+       --lr-decay-iters 990000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file bert-vocab.txt \
+       --data-impl mmap \
+       --split 949,50,1 \
+       --lr 0.0001 \
+       --min-lr 0.00001 \
+       --lr-decay-style linear \
+       --lr-warmup-fraction .01 \
+       --weight-decay 1e-2 \
+       --clip-grad 1.0 \
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
+       --fp16
--- a/examples/pretrain_bert_distributed.sh
+++ b/examples/pretrain_bert_distributed.sh
+#!/bin/bash
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+DATA_PATH=<Specify path and file prefix>_text_sentence
+CHECKPOINT_PATH=<Specify path>
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+python -m torch.distributed.launch $DISTRIBUTED_ARGS \
+       pretrain_bert.py \
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --num-attention-heads 16 \
+       --micro-batch-size 4 \
+       --global-batch-size 32 \
+       --seq-length 512 \
+       --max-position-embeddings 512 \
+       --train-iters 1000000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file bert-vocab.txt \
+       --data-impl mmap \
+       --split 949,50,1 \
+       --distributed-backend nccl \
+       --lr 0.0001 \
+       --lr-decay-style linear \
+       --min-lr 1.0e-5 \
+       --lr-decay-iters 990000 \
+       --weight-decay 1e-2 \
+       --clip-grad 1.0 \
+       --lr-warmup-fraction .01 \
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
+       --fp16