# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# To get the latest APEX
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.04-py3
FROM ${FROM_IMAGE_NAME}
# Install dependencies
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bzip2 \
cabextract \
iputils-ping \
pbzip2 \
pv \
lsof \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /workspace/bert
COPY requirements.txt .
ARG PYTHON=python3.8
RUN $PYTHON -m pip install --no-cache-dir -r requirements.txt
# Preprocessing
# WORKDIR /workspace
RUN cd /workspace && git clone https://github.com/attardi/wikiextractor.git
RUN cd /workspace/wikiextractor && git checkout e4abb4cbd019b0257824ee47c23dd163919b731b
# Install BERT
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
ENV PYTHONPATH "/workspace/bert"
# Install GCC 8.2
RUN apt-get update
RUN apt-get install -y libssl-dev ccache
RUN mkdir -p /workspace/temp_install_dir
WORKDIR /workspace/temp_install_dir
COPY sanitizer_platform_limits_posix.h .
COPY sanitizer_platform_limits_posix.cc .
RUN wget --no-proxy -O gcc-8.2.0.tar.xz https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz
RUN tar -vxf gcc-8.2.0.tar.xz
WORKDIR gcc-8.2.0
RUN mv ../sanitizer_platform_limits_posix.h libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
RUN mv ../sanitizer_platform_limits_posix.cc libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cc
RUN sed -i 's/ftp/http/g' ./contrib/download_prerequisites
RUN ./contrib/download_prerequisites
ARG GCC_NEW_DIR=/usr/local/gcc-8.2
ARG GCC_OLD_DIR=/usr/bin
RUN env LIBRARY_PATH="" ./configure --prefix=$GCC_NEW_DIR \
--enable-threads=posix --disable-checking --disable-multilib \
--enable-languages=c,c++
RUN env LIBRARY_PATH="" make -j `nproc`
RUN env LIBRARY_PATH="" make install -j `nproc`
COPY replace_gcc_symlink.sh .
RUN bash -ex replace_gcc_symlink.sh "$GCC_NEW_DIR" "$GCC_OLD_DIR"
# Install cmake 3.16.0
WORKDIR /workspace/temp_install_dir
RUN wget -O cmake-3.16.0.tar.gz https://cmake.org/files/v3.16/cmake-3.16.0.tar.gz
RUN tar -zvxf cmake-3.16.0.tar.gz
WORKDIR cmake-3.16.0
RUN ./bootstrap
RUN make -j `nproc`
RUN make install -j `nproc`
COPY replace_cmake_symlink.sh .
RUN bash -ex replace_cmake_symlink.sh /usr/local/bin/cmake
# Cleanup install dir
WORKDIR /workspace/bert
RUN rm -rf /workspace/temp_install_dir
# Remove libsnappy-dev to avoid ld link error when compiling Paddle
RUN apt-get purge -y libsnappy-dev
# Remove protoc to avoid compilation errors when building Paddle
RUN mv /usr/bin/protoc /usr/bin/protoc.bak
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
SRC_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3"
DST_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3-paddle-fast-test"
WHEEL_URL="https://paddle-wheel.bj.bcebos.com/mlperf-2.0"
PD_WHEEL_NAME="paddlepaddle_gpu-0.0.0-cp38-cp38-linux_x86_64.whl"
OP_TAR_NAME="custom_setup_ops.tar.gz"
PYBIND_FUNCTION_SO_NAME="functions.cpython-38-x86_64-linux-gnu.so"
PYTHON="python3.8"
###################
TMP_DOCKERFILE=Dockerfile.tmp
APEX_CLONE_DIR=/workspace/apex_dir
APEX_DIR=$APEX_CLONE_DIR/apex/build_scripts
PY_PACKAGE_DIR=/opt/conda/lib/python3.8/site-packages
OP_INSTALL_DIR=$PY_PACKAGE_DIR/custom_setup_ops
if [[ $SRC_IMAGE == $DST_IMAGE ]]; then
echo "Error: SRC_IMAGE and DST_IMAGE cannot be the same!!!"
exit 1
fi
OLD_DIR=`pwd`
NEW_DIR=$(dirname `readlink -f "$0"`)
cd $NEW_DIR
cat <<EOF >$TMP_DOCKERFILE
FROM $SRC_IMAGE
RUN mkdir -p $APEX_CLONE_DIR \
&& cd $APEX_CLONE_DIR \
&& git clone -b new_fmhalib https://github.com/sneaxiy/apex \
&& cd $APEX_DIR \
&& bash build.sh
RUN curl -O $WHEEL_URL/$PD_WHEEL_NAME \
&& $PYTHON -m pip install -U --force-reinstall $PD_WHEEL_NAME \
&& rm -rf $PD_WHEEL_NAME
RUN mkdir -p $OP_INSTALL_DIR \
&& cd $OP_INSTALL_DIR \
&& curl -O $WHEEL_URL/$OP_TAR_NAME \
&& tar -zvxf $OP_TAR_NAME \
&& rm -rf $OP_TAR_NAME
RUN echo "from .custom_setup_ops import *">$OP_INSTALL_DIR/__init__.py
RUN $PYTHON -m pip install -U --force-reinstall git+https://github.com/mlperf/logging.git@2.0.0-rc1
RUN mkdir -p $PY_PACKAGE_DIR/pybind \
&& cd $PY_PACKAGE_DIR/pybind \
&& curl -O $WHEEL_URL/$PYBIND_FUNCTION_SO_NAME
COPY requirements.txt .
RUN $PYTHON -m pip install -r requirements.txt
RUN $PYTHON -m pip install -U --force-reinstall protobuf==3.20.1
EOF
docker build -t $DST_IMAGE \
--build-arg http_proxy=$http_proxy \
--build-arg https_proxy=$https_proxy \
--build-arg no_proxy=$no_proxy \
-f $TMP_DOCKERFILE .
rm -rf $TMP_DOCKERFILE
cd $OLD_DIR
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
SRC_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3"
DST_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev-test"
PYTHON_VER="3.8"
###################
TMP_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev"
TMP_DOCKERFILE=Dockerfile.tmp
PADDLE_DIR=/workspace/Paddle_src
APEX_CLONE_DIR=/workspace/apex_dir
APEX_DIR=$APEX_CLONE_DIR/apex/build_scripts
PY_PACKAGE_DIR=/opt/conda/lib/python3.8/site-packages
PYTHON=python$PYTHON_VER
if [[ $SRC_IMAGE == $DST_IMAGE ]]; then
echo "Error: SRC_IMAGE and DST_IMAGE cannot be the same!!!"
exit 1
fi
OLD_DIR=`pwd`
NEW_DIR=$(dirname `readlink -f "$0"`)
cd $NEW_DIR
docker build -t $TMP_IMAGE \
--build-arg http_proxy=$http_proxy \
--build-arg https_proxy=$https_proxy \
--build-arg no_proxy=$no_proxy \
-f Dockerfile .
cd $NEW_DIR/..
cat <<EOF >$TMP_DOCKERFILE
FROM $TMP_IMAGE
RUN mkdir -p $APEX_CLONE_DIR \
&& cd $APEX_CLONE_DIR \
&& git clone -b new_fmhalib https://github.com/sneaxiy/apex \
&& cd $APEX_DIR \
&& bash build.sh
ENV APEX_DIR $APEX_DIR
RUN mkdir -p $PADDLE_DIR \
&& cd $PADDLE_DIR \
&& git clone https://github.com/PaddlePaddle/Paddle \
&& cd $PADDLE_DIR/Paddle \
&& git checkout 108aeb28704e64a54f82b8a59266a4e9633f9949
ENV COMPILE_DIR $PADDLE_DIR/Paddle/build
RUN mkdir -p $PADDLE_DIR/Paddle/build
RUN cd $PADDLE_DIR/Paddle/build && cmake .. \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCUDA_ARCH_NAME=Ampere \
-DWITH_AVX=ON \
-DWITH_MKL=ON \
-DWITH_DISTRIBUTE=ON \
-DWITH_BRPC_RDMA=OFF \
-DWITH_LIBXSMM=OFF \
-DWITH_PSLIB=OFF \
-DWITH_BOX_PS=OFF \
-DWITH_XBYAK=ON \
-DWITH_PSCORE=ON \
-DWITH_HETERPS=OFF \
-DWITH_GLOO=ON \
-DWITH_TESTING=OFF \
-DPY_VERSION=$PYTHON_VER
RUN cd $PADDLE_DIR/Paddle/build && make -j `nproc`
RUN $PYTHON -m pip install -U --force-reinstall $PADDLE_DIR/Paddle/build/python/dist/*.whl
COPY external_ops external_ops
RUN cd external_ops && $PYTHON setup.py install --force && rm -rf external_ops
COPY pybind pybind
RUN cd pybind && $PYTHON compile.py && mkdir -p $PY_PACKAGE_DIR/pybind && cp *.so $PY_PACKAGE_DIR/pybind
RUN $PYTHON -m pip install -U --force-reinstall git+https://github.com/mlperf/logging.git@2.0.0-rc1
RUN $PYTHON -m pip install -U --force-reinstall protobuf==3.20.1
EOF
docker build -t $DST_IMAGE \
--build-arg http_proxy=$http_proxy \
--build-arg https_proxy=$https_proxy \
--build-arg no_proxy=$no_proxy \
-f $TMP_DOCKERFILE .
rm -rf $TMP_DOCKERFILE
cd $OLD_DIR
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NEW_CMAKE="$1"
OLD_CMAKE=`which cmake`
mv "$OLD_CMAKE" "$OLD_CMAKE.bak"
ln -s "$NEW_CMAKE" "$OLD_CMAKE"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
GCC_NEW_DIR="$1"
GCC_OLD_DIR="$2"
for prefix in gcc x86_64-linux-gnu-gcc;
do
for suffix in "" "-ar" "-nm" "-ranlib";
do
old_file="$GCC_OLD_DIR/$prefix$suffix"
mv "$old_file" "$old_file.bak"
ln -s "$GCC_NEW_DIR/bin/$prefix$suffix" "$old_file"
done
done
for prefix in g++ x86_64-linux-gnu-g++;
do
old_file="$GCC_OLD_DIR/$prefix"
mv "$old_file" "$old_file.bak"
ln -s "$GCC_NEW_DIR/bin/$prefix" "$old_file"
done
# progress bars in model download and training scripts
boto3==1.14.0
gdown==3.13.0
h5py==2.10.0
html2text==2020.1.16
ipdb==0.13.2
nltk==3.5
onnxruntime==1.3.0
parameterized
progressbar==2.5
requests==2.23.0
six==1.15.0
tensorflow==2.2.0
jieba
colorlog
colorama
seqeval
multiprocess
mpi4py
paddlenlp
git+https://github.com/mlperf/logging.git@2.0.0-rc1
# Download and prepare the data
Please download and prepare the data as described [here](https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/README.md#download-and-prepare-the-data).
After preparation, the directory layout should look like this:
```
<BASE_DATA_DIR>
|_ phase1 # checkpoint to start from tf1
|_ hdf5
|_ eval # evaluation chunks in binary hdf5 format fixed length
|_ eval_varlength # evaluation chunks in binary hdf5 format variable length
|_ training_4320 #
|_ hdf5_4320_shards_uncompressed # sharded data in hdf5 format fixed length
|_ hdf5_4320_shards_varlength # sharded data in hdf5 format variable length
```
# Build the docker image
We provide two ways to build the Docker image used to run the tests.
## Build the docker image with pre-built binaries
Some of the necessary binaries are pre-built, so the Docker image can be built quickly.
```
bash Dockerfiles/build_image_fast.sh
```
After the command finishes, you will get the docker image named `nvcr.io/nvidia/pytorch:22.04-py3-paddle-fast-test`.
## Build the docker image from scratch
This method takes a long time and consists of the following steps:
- Build the docker image which can compile the PaddlePaddle source code.
- Compile the PaddlePaddle source code.
- Compile the PaddlePaddle external operators.
- Compile the PaddlePaddle external pybind functions.
You can run all of the steps above with the following command.
```
bash Dockerfiles/build_image_from_scratch.sh
```
After the command finishes, you will get the docker image named `nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev-test`.
# Prepare the checkpoint file
The original checkpoint of the BERT model is generated by TensorFlow. We convert the TensorFlow checkpoint file to a Python dictionary like the one below and dump the dictionary with the Python pickle module.
```python
{
"bert/encoder/layer_0/attention/self/query/kernel": numpy.ndarray(...),
"bert/encoder/layer_0/attention/self/query/bias": numpy.ndarray(...),
...
}
```
This way, the tests can run after conversion without TensorFlow being installed. You can convert the original TensorFlow checkpoint file with the following command:
```
python models/load_tf_checkpoint.py \
<BASE_DATA_DIR>/phase1/model.ckpt-28252 \
<BASE_DATA_DIR>/phase1/model.ckpt-28252.tf_pickled
```
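For reference, here is a minimal sketch of reading the converted checkpoint back (assuming it was dumped with Python's `pickle` module as described above; replace `<BASE_DATA_DIR>` with your data directory):
```python
import pickle

# The pickled checkpoint is a plain dict of parameter name -> numpy.ndarray,
# so it can be inspected without installing TensorFlow.
with open("<BASE_DATA_DIR>/phase1/model.ckpt-28252.tf_pickled", "rb") as f:
    params = pickle.load(f)

# Print a few parameter names and shapes.
for name, value in list(params.items())[:5]:
    print(name, value.shape)
```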
# Run the tests
```
export NEXP=10 # number of trial runs
export BASE_DATA_DIR=<your_bert_data_dir>
export CONT=<your_docker_image_name>
STAGE=run bash run_with_docker.sh
```
# BERT-large (Bidirectional Encoder Representations from Transformers)
## Model Introduction
BERT is a pre-trained language model based on the Transformer architecture, proposed by Google in 2018. Through large-scale self-supervised pre-training, BERT learns rich language representations that can be applied to a wide range of natural language processing tasks.
The core of BERT is the Transformer encoder, which encodes the input text and produces contextual representations. BERT uses a bidirectional encoder, i.e. it considers both the left and the right context of the input sequence, so it captures more comprehensive language representations. In the pre-training stage, BERT uses two pre-training tasks: Masked Language Model (MLM) and Next Sentence Prediction (NSP).
In the MLM task, BERT randomly replaces some tokens with the "[MASK]" token and then tries to predict the original tokens at those positions. In the NSP task, BERT has to decide whether two sentences are consecutive. Through these two pre-training tasks, BERT learns rich language representations that transfer to various natural language processing tasks.
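As a rough illustration of the MLM objective, the sketch below randomly masks tokens and records the original tokens as prediction targets (a minimal example, not the preprocessing code used in this repository; the 15% masking ratio follows the original BERT paper):
```python
import random

def mask_tokens(tokens, mask_prob=0.15, mask_token="[MASK]"):
    """Randomly replace a fraction of tokens with [MASK]; keep the originals as labels."""
    masked, labels = list(tokens), {}
    for i, tok in enumerate(tokens):
        if random.random() < mask_prob:
            labels[i] = tok          # the model must predict this original token
            masked[i] = mask_token
    return masked, labels

print(mask_tokens("the quick brown fox jumps over the lazy dog".split()))
```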
## Model Structure
BERT-large is a larger and more complex version of BERT. Compared with BERT-base, it has more layers, more parameters and a deeper network, so it can learn deeper and more complex language representations.
BERT-large contains 24 Transformer encoder layers with a hidden size of 1024, for a total of about 340M parameters. In the pre-training stage, BERT-large uses more unlabeled text than BERT-base and is optimized with the Masked Language Model (MLM) and Next Sentence Prediction (NSP) tasks. Its pre-training is more complex than that of BERT-base and takes longer to train.
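A rough back-of-the-envelope check of the ~340M figure (a sketch using the standard BERT-large hyper-parameters: 24 layers, hidden size 1024, feed-forward size 4096, a WordPiece vocabulary of roughly 30K; biases and LayerNorm parameters are ignored):
```python
# Approximate BERT-large parameter count (biases and LayerNorm omitted).
vocab, hidden, ffn, layers, max_pos = 30522, 1024, 4096, 24, 512

embeddings = (vocab + max_pos + 2) * hidden          # word + position + segment embeddings
per_layer = 4 * hidden * hidden + 2 * hidden * ffn   # attention (Q, K, V, O) + feed-forward
total = embeddings + layers * per_layer
print(f"~{total / 1e6:.0f}M parameters")             # roughly 330-340M
```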
## Dataset
The training data comes from the Wikipedia dump of 2020/01/01, a commonly used natural language processing dataset containing Wikipedia articles and their abstracts (i.e. the first paragraph). It can be used for various text-related tasks such as text classification, text summarization and named entity recognition.
Download and preprocess the data as follows:
```
./input_preprocessing/prepare_data.sh --outputdir /workspace/bert_data
python3 models/load_tf_checkpoint.py \
    /workspace/bert_data/phase1/model.ckpt-28252 \
    /workspace/bert_data/phase1/model.ckpt-28252.tf_pickled
```
## Training
### Environment Setup
A training Docker image can be pulled from [光源](https://www.sourcefind.cn/#/service-details):
* Training image:
Install the Python dependencies:
```
pip3 install -r requirements.txt
```
### Training
Training command (a single node with 8 GPUs is used as an example):
```
bash run_8gpu.sh
```
Configurations and data paths differ between environments; adjust the following line in the run_benchmark_8gpu.sh script accordingly:
```
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/DL_DATA/mlperf/bert"}  # set this to the actual data path
```
### Pre-trained Model
The /workspace/bert_data directory holds the pre-trained model as follows:
```
/workspace/bert_data/phase1
└── model.ckpt-28252.tf_pickled  # pre-trained model
```
### Single-GPU Test
Test command:
```
bash run_1gpu.sh
```
## Performance and Accuracy
The tests use the input data described above on Z100L accelerator cards. The following are results for a single node with 8 GPUs:
| Test Platform | Accuracy | Speed(seq/s) |
| :-----------: | :------: | :----------: |
| Z100L | 0.72 | 89.59 |
## Previous Versions
* https://developer.hpccube.com/codes/modelzoo/mlperf_bert-large
## References
* https://mlcommons.org/en/
* https://github.com/mlcommons
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import json
def get_mllog_json(line):
    prefix = ":::MLLOG"
    if not line.startswith(prefix):
        return None
    line = line[len(prefix):].strip()
    return json.loads(line)
def readlines(file_path):
    with open(file_path, "r") as f:
        return list(f.readlines())
def analyze_one_file(file_path, gbs):
    lines = readlines(file_path)
    run_start_t = None
    run_end_t = None
    train_samples = None
    success = None
    for line in lines:
        if "run_start" not in line and "run_stop" not in line and "train_samples" not in line:
            continue
        log_json = get_mllog_json(line)
        if log_json is None or "key" not in log_json:
            continue
        key = log_json["key"]
        if key == "run_start":
            run_start_t = log_json["time_ms"]
        elif key == "train_samples":
            train_samples = log_json["value"]
        elif key == "run_stop":
            run_end_t = log_json["time_ms"]
            success = 1 if log_json["metadata"]["status"] == "success" else 0
            break
    assert (run_start_t is not None and run_end_t is not None and
            success is not None and train_samples is not None), file_path
    assert train_samples % gbs == 0
    return ((run_end_t - run_start_t) / 60.0 / 1000.0, success, train_samples,
            train_samples / gbs)
def avg_without_min_max(times):
    min_t = min(times)
    max_t = max(times)
    min_idx = [i for i, t in enumerate(times) if t == min_t][0]
    max_idx = [i for i, t in enumerate(times) if t == max_t][0]
    times = [t for i, t in enumerate(times) if i != min_idx and i != max_idx]
    return sum(times) / len(times), min_idx, max_idx
class TablePrinter(object):
    def __init__(self, headers):
        self.headers = list([str(h) for h in headers])
        self.rows = []
        self.max_lens = [len(h) for h in self.headers]

    def add_row(self, row):
        assert len(row) == len(self.headers)
        row = [str(item) for item in row]
        self.max_lens = [
            max(length, len(row[i])) for i, length in enumerate(self.max_lens)
        ]
        self.rows.append(row)

    def _aligned_str(self, s, length):
        return s + (' ' * (length - len(s)))

    def _aligned_row(self, row, separator=' '):
        return separator.join([
            self._aligned_str(s, self.max_lens[i]) for i, s in enumerate(row)
        ])

    def print_table(self):
        print(self._aligned_row(self.headers))
        for row in self.rows:
            print(self._aligned_row(row))
def analyze(file_pattern, file_num, gbs, min_train_samples, win_size=10):
    results = []
    for file_idx in range(file_num):
        i = file_idx + 1
        file_path = file_pattern.format(i)
        ret = [i] + list(analyze_one_file(file_path, gbs))
        results.append(ret)

    table1 = TablePrinter([
        'FileIdx',
        'Success',
        'TrainSamples',
        'TrainingSteps',
        'Time(min)',
        'ValidTime(min)',
        'Throughput(s/step)',
    ])
    for file_idx, t, success, samples, step in results:
        table1.add_row([
            file_idx,
            success,
            samples,
            step,
            t,
            t if success else float('inf'),
            t / step * 60.0,
        ])
    table1.print_table()

    n = len(results)
    win_results = []
    for i in range(n - win_size + 1):
        times = [
            results[i + j][1] if results[i + j][2] else float('inf')
            for j in range(win_size)
        ]
        avg_time, min_idx, max_idx = avg_without_min_max(times)
        samples = [
            float(results[i + j][3]) for j in range(win_size)
            if j != min_idx and j != max_idx
        ]
        avg_samples = sum(samples) / len(samples)
        start_idx = results[i][0]
        end_idx = results[i + win_size - 1][0]
        win_results.append((start_idx, end_idx, avg_samples, avg_time))

    print('-' * 120)
    table2 = TablePrinter([
        'StartFileIdx',
        'EndFileIdx',
        'AvgSamples',
        'AvgTime(min)',
        'ValidAvgTime(min)',
    ])
    for start_idx, end_idx, avg_samples, avg_time in win_results:
        valid_avg_time = avg_time if avg_samples >= min_train_samples else float('inf')
        table2.add_row([start_idx, end_idx, avg_samples, avg_time, valid_avg_time])
    table2.print_table()
def get_or_default(idx, default, type=None):
    args = sys.argv
    value = args[idx] if idx < len(args) else default
    return type(value) if type is not None else value
if __name__ == "__main__":
    nargv = len(sys.argv)
    assert nargv >= 2 and nargv <= 5, "Usage: {} {} <file_path_pattern> [<file_num>] [<global_batch_size>] [<min_train_samples>]".format(
        sys.executable, sys.argv[0])
    file_pattern = sys.argv[1]
    file_num = get_or_default(2, 1, int)
    gbs = get_or_default(3, 8 * 56, int)
    min_train_samples = get_or_default(4, 2621696.0 / 1.0387858550359907, float)
    analyze(file_pattern, file_num, gbs, min_train_samples)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
def generate_mask(attention_mask, unpad_fmha=False):
    if unpad_fmha:
        # Sum each row of the [bs, max_seq_len] mask to get the actual seq_len of each row (1-D).
        #seqlen = attention_mask.sum(dim=1).to(dtype=torch.int32).flatten()
        attention_mask_tmp = paddle.sum(attention_mask, axis=1)
        attention_mask_sum = paddle.cast(attention_mask_tmp, 'int32')
        seqlen = paddle.reshape(attention_mask_sum, [-1])
        print("seqlen is ", seqlen)
        # Record the indices of the non-zero elements.
        #indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
        attention_mask_1d = paddle.reshape(attention_mask, [-1])
        indices = paddle.nonzero(attention_mask_1d, as_tuple=False)
        indices = paddle.reshape(indices, [-1])
        # Maximum seq_len of the current batch.
        # maxseqlen = seqlen.max().item()
        maxseqlen_d = paddle.max(seqlen)
        # Note: using paddle.CUDAPinnedPlace() would cause the following errors:
        '''
        File "/usr/local/lib/python3.8/dist-packages/paddle/fluid/framework.py", line 2305, in __init__
            for frame in traceback.extract_stack():
        UnimplementedError: Unsupported place type `CUDAPinnedPlace` when casting paddle place to enum place. (at /limin29/Paddle/paddle/fluid/framework/custom_tensor_utils.h:135)
            [operator < custom_fmha > error]
        '''
        # maxseqlen = paddle.tensor.creation._memcpy(maxseqlen_d, paddle.CUDAPinnedPlace())
        maxseqlen = paddle.tensor.creation._memcpy(maxseqlen_d, paddle.CPUPlace())
        print("maxseqlen", maxseqlen)
        prefix_sum = paddle.cumsum(seqlen, axis=0)
        zero_tensor = paddle.zeros([1], dtype='int32')
        # Prefix sums of the sequence lengths: [0, a[0], a[0] + a[1], ...]
        cu_seqlens = paddle.concat(x=[zero_tensor, prefix_sum])
        # The last element of cu_seqlens is the sum of all actual seq_lens in the current batch.
        # device tensor with shape [1]
        ntokens_d = cu_seqlens[-1]
        # host tensor with shape [1]
        #ntokens = paddle.tensor.creation._memcpy(ntokens_d, paddle.CUDAPinnedPlace())
        ntokens = paddle.tensor.creation._memcpy(ntokens_d, paddle.CPUPlace())
        print("ntokens = ", ntokens)
        return indices, attention_mask, seqlen, ntokens, cu_seqlens, seqlen, maxseqlen
    else:
        raise NotImplementedError()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## DL params
export BATCHSIZE=56
export GRADIENT_STEPS=1
export LR=3.5e-4
export MAX_SAMPLES_TERMINATION=4500000
export MAX_STEPS=7100
export OPT_LAMB_BETA_1=0.9
export OPT_LAMB_BETA_2=0.999
export START_WARMUP_STEP=0
export WARMUP_PROPORTION=0.0
export PHASE=2
export EVAL_ITER_START_SAMPLES=150000
export EVAL_ITER_SAMPLES=150000
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=01:15:00
## System config params
source ${BASH_SOURCE%/*}/config_DGXA100_common.sh
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2 # 2 if HT is on, 1 if HT is off