# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# To get the latest APEX
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.04-py3
FROM ${FROM_IMAGE_NAME}
# Install dependencies
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bzip2 \
cabextract \
iputils-ping \
pbzip2 \
pv \
lsof \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /workspace/bert
COPY requirements.txt .
ARG PYTHON=python3.8
RUN $PYTHON -m pip install --no-cache-dir -r requirements.txt
# Preprocessing
# WORKDIR /workspace
RUN cd /workspace && git clone https://github.com/attardi/wikiextractor.git
RUN cd /workspace/wikiextractor && git checkout e4abb4cbd019b0257824ee47c23dd163919b731b
# Install BERT
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
ENV PYTHONPATH "/workspace/bert"
# Install GCC 8.2
RUN apt-get update
RUN apt-get install -y libssl-dev ccache
RUN mkdir -p /workspace/temp_install_dir
WORKDIR /workspace/temp_install_dir
COPY sanitizer_platform_limits_posix.h .
COPY sanitizer_platform_limits_posix.cc .
RUN wget --no-proxy -O gcc-8.2.0.tar.xz https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz
RUN tar -vxf gcc-8.2.0.tar.xz
WORKDIR gcc-8.2.0
RUN mv ../sanitizer_platform_limits_posix.h libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
RUN mv ../sanitizer_platform_limits_posix.cc libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.cc
RUN sed -i 's/ftp/http/g' ./contrib/download_prerequisites
RUN ./contrib/download_prerequisites
ARG GCC_NEW_DIR=/usr/local/gcc-8.2
ARG GCC_OLD_DIR=/usr/bin
RUN env LIBRARY_PATH="" ./configure --prefix=$GCC_NEW_DIR \
--enable-threads=posix --disable-checking --disable-multilib \
--enable-languages=c,c++
RUN env LIBRARY_PATH="" make -j `nproc`
RUN env LIBRARY_PATH="" make install -j `nproc`
COPY replace_gcc_symlink.sh .
RUN bash -ex replace_gcc_symlink.sh "$GCC_NEW_DIR" "$GCC_OLD_DIR"
# Install cmake 3.16.0
WORKDIR /workspace/temp_install_dir
RUN wget -O cmake-3.16.0.tar.gz https://cmake.org/files/v3.16/cmake-3.16.0.tar.gz
RUN tar -zvxf cmake-3.16.0.tar.gz
WORKDIR cmake-3.16.0
RUN ./bootstrap
RUN make -j `nproc`
RUN make install -j `nproc`
COPY replace_cmake_symlink.sh .
RUN bash -ex replace_cmake_symlink.sh /usr/local/bin/cmake
# Cleanup install dir
WORKDIR /workspace/bert
RUN rm -rf /workspace/temp_install_dir
# Remove libsnappy-dev to avoid ld link error when compiling Paddle
RUN apt-get purge -y libsnappy-dev
# Remove protoc to avoid compilation errors when building Paddle
RUN mv /usr/bin/protoc /usr/bin/protoc.bak
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
SRC_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3"
DST_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3-paddle-fast-test"
WHEEL_URL="https://paddle-wheel.bj.bcebos.com/mlperf-2.0"
PD_WHEEL_NAME="paddlepaddle_gpu-0.0.0-cp38-cp38-linux_x86_64.whl"
OP_TAR_NAME="custom_setup_ops.tar.gz"
PYBIND_FUNCTION_SO_NAME="functions.cpython-38-x86_64-linux-gnu.so"
PYTHON="python3.8"
###################
TMP_DOCKERFILE=Dockerfile.tmp
APEX_CLONE_DIR=/workspace/apex_dir
APEX_DIR=$APEX_CLONE_DIR/apex/build_scripts
PY_PACKAGE_DIR=/opt/conda/lib/python3.8/site-packages
OP_INSTALL_DIR=$PY_PACKAGE_DIR/custom_setup_ops
if [[ $SRC_IMAGE == $DST_IMAGE ]]; then
echo "Error: SRC_IMAGE and DST_IMAGE cannot be the same!!!"
exit 1
fi
OLD_DIR=`pwd`
NEW_DIR=$(dirname `readlink -f "$0"`)
cd $NEW_DIR
cat <<EOF >$TMP_DOCKERFILE
FROM $SRC_IMAGE
RUN mkdir -p $APEX_CLONE_DIR \
&& cd $APEX_CLONE_DIR \
&& git clone -b new_fmhalib https://github.com/sneaxiy/apex \
&& cd $APEX_DIR \
&& bash build.sh
RUN curl -O $WHEEL_URL/$PD_WHEEL_NAME \
&& $PYTHON -m pip install -U --force-reinstall $PD_WHEEL_NAME \
&& rm -rf $PD_WHEEL_NAME
RUN mkdir -p $OP_INSTALL_DIR \
&& cd $OP_INSTALL_DIR \
&& curl -O $WHEEL_URL/$OP_TAR_NAME \
&& tar -zvxf $OP_TAR_NAME \
&& rm -rf $OP_TAR_NAME
RUN echo "from .custom_setup_ops import *">$OP_INSTALL_DIR/__init__.py
RUN $PYTHON -m pip install -U --force-reinstall git+https://github.com/mlperf/logging.git@2.0.0-rc1
RUN mkdir -p $PY_PACKAGE_DIR/pybind \
&& cd $PY_PACKAGE_DIR/pybind \
&& curl -O $WHEEL_URL/$PYBIND_FUNCTION_SO_NAME
COPY requirements.txt .
RUN $PYTHON -m pip install -r requirements.txt
RUN $PYTHON -m pip install -U --force-reinstall protobuf==3.20.1
EOF
docker build -t $DST_IMAGE \
--build-arg http_proxy=$http_proxy \
--build-arg https_proxy=$https_proxy \
--build-arg no_proxy=$no_proxy \
-f $TMP_DOCKERFILE .
rm -rf $TMP_DOCKERFILE
cd $OLD_DIR
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
SRC_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3"
DST_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev-test"
PYTHON_VER="3.8"
###################
TMP_IMAGE="nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev"
TMP_DOCKERFILE=Dockerfile.tmp
PADDLE_DIR=/workspace/Paddle_src
APEX_CLONE_DIR=/workspace/apex_dir
APEX_DIR=$APEX_CLONE_DIR/apex/build_scripts
PY_PACKAGE_DIR=/opt/conda/lib/python3.8/site-packages
PYTHON=python$PYTHON_VER
if [[ $SRC_IMAGE == $DST_IMAGE ]]; then
echo "Error: SRC_IMAGE and DST_IMAGE cannot be the same!!!"
exit 1
fi
OLD_DIR=`pwd`
NEW_DIR=$(dirname `readlink -f "$0"`)
cd $NEW_DIR
docker build -t $TMP_IMAGE \
--build-arg http_proxy=$http_proxy \
--build-arg https_proxy=$https_proxy \
--build-arg no_proxy=$no_proxy \
-f Dockerfile .
cd $NEW_DIR/..
cat <<EOF >$TMP_DOCKERFILE
FROM $TMP_IMAGE
RUN mkdir -p $APEX_CLONE_DIR \
&& cd $APEX_CLONE_DIR \
&& git clone -b new_fmhalib https://github.com/sneaxiy/apex \
&& cd $APEX_DIR \
&& bash build.sh
ENV APEX_DIR $APEX_DIR
RUN mkdir -p $PADDLE_DIR \
&& cd $PADDLE_DIR \
&& git clone https://github.com/PaddlePaddle/Paddle \
&& cd $PADDLE_DIR/Paddle \
&& git checkout 108aeb28704e64a54f82b8a59266a4e9633f9949
ENV COMPILE_DIR $PADDLE_DIR/Paddle/build
RUN mkdir -p $PADDLE_DIR/Paddle/build
RUN cd $PADDLE_DIR/Paddle/build && cmake .. \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCUDA_ARCH_NAME=Ampere \
-DWITH_AVX=ON \
-DWITH_MKL=ON \
-DWITH_DISTRIBUTE=ON \
-DWITH_BRPC_RDMA=OFF \
-DWITH_LIBXSMM=OFF \
-DWITH_PSLIB=OFF \
-DWITH_BOX_PS=OFF \
-DWITH_XBYAK=ON \
-DWITH_PSCORE=ON \
-DWITH_HETERPS=OFF \
-DWITH_GLOO=ON \
-DWITH_TESTING=OFF \
-DPY_VERSION=$PYTHON_VER
RUN cd $PADDLE_DIR/Paddle/build && make -j `nproc`
RUN $PYTHON -m pip install -U --force-reinstall $PADDLE_DIR/Paddle/build/python/dist/*.whl
COPY external_ops external_ops
RUN cd external_ops && $PYTHON setup.py install --force && rm -rf external_ops
COPY pybind pybind
RUN cd pybind && $PYTHON compile.py && mkdir -p $PY_PACKAGE_DIR/pybind && cp *.so $PY_PACKAGE_DIR/pybind
RUN $PYTHON -m pip install -U --force-reinstall git+https://github.com/mlperf/logging.git@2.0.0-rc1
RUN $PYTHON -m pip install -U --force-reinstall protobuf==3.20.1
EOF
docker build -t $DST_IMAGE \
--build-arg http_proxy=$http_proxy \
--build-arg https_proxy=$https_proxy \
--build-arg no_proxy=$no_proxy \
-f $TMP_DOCKERFILE .
rm -rf $TMP_DOCKERFILE
cd $OLD_DIR
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NEW_CMAKE="$1"
OLD_CMAKE=`which cmake`
mv "$OLD_CMAKE" "$OLD_CMAKE.bak"
ln -s "$NEW_CMAKE" "$OLD_CMAKE"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
GCC_NEW_DIR="$1"
GCC_OLD_DIR="$2"
for prefix in gcc x86_64-linux-gnu-gcc;
do
for suffix in "" "-ar" "-nm" "-ranlib";
do
old_file="$GCC_OLD_DIR/$prefix$suffix"
mv "$old_file" "$old_file.bak"
ln -s "$GCC_NEW_DIR/bin/$prefix$suffix" "$old_file"
done
done
for prefix in g++ x86_64-linux-gnu-g++;
do
old_file="$GCC_OLD_DIR/$prefix"
mv "$old_file" "$old_file.bak"
ln -s "$GCC_NEW_DIR/bin/$prefix" "$old_file"
done
# progress bars in model download and training scripts
boto3==1.14.0
gdown==3.13.0
h5py==2.10.0
html2text==2020.1.16
ipdb==0.13.2
nltk==3.5
onnxruntime==1.3.0
parameterized
progressbar==2.5
requests==2.23.0
six==1.15.0
tensorflow==2.2.0
jieba
colorlog
colorama
seqeval
multiprocess
mpi4py
paddlenlp
git+https://github.com/mlperf/logging.git@2.0.0-rc1
# Download and prepare the data
Please download and prepare the data as described [here](https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/README.md#download-and-prepare-the-data).
After preparation, the directory layout should look like this:
```
<BASE_DATA_DIR>
|_ phase1 # checkpoint to start from tf1
|_ hdf5
|_ eval # evaluation chunks in binary hdf5 format fixed length
|_ eval_varlength # evaluation chunks in binary hdf5 format variable length
|_ training_4320 #
|_ hdf5_4320_shards_uncompressed # sharded data in hdf5 format fixed length
|_ hdf5_4320_shards_varlength # sharded data in hdf5 format variable length
```
# Build the docker image
We provide two ways to build the Docker image used to run the tests.
## Build the docker image with pre-built binaries
Some of the necessary binaries are pre-built, so the Docker image can be built quickly.
```
bash Dockerfiles/build_image_fast.sh
```
After the command finishes, you will get the docker image named `nvcr.io/nvidia/pytorch:22.04-py3-paddle-fast-test`.
## Build the docker image from scratch
This method takes a long time and consists of the following steps:
- Build the docker image which can compile the PaddlePaddle source code.
- Compile the PaddlePaddle source code.
- Compile the PaddlePaddle external operators.
- Compile the PaddlePaddle external pybind functions.
You can run all of the steps above with the following command.
```
bash Dockerfiles/build_image_from_scratch.sh
```
After the command finishes, you will get the docker image named `nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev-test`.
# Prepare the checkpoint file
The original checkpoint of the BERT model is generated by TensorFlow. We convert the TensorFlow checkpoint file to a Python dictionary like the one below and dump the dictionary with the Python pickle module.
```python
{
"bert/encoder/layer_0/attention/self/query/kernel": numpy.ndarray(...),
"bert/encoder/layer_0/attention/self/query/bias": numpy.ndarray(...),
...
}
```
This way, the tests can run after conversion without TensorFlow being installed. You can convert the original TensorFlow checkpoint file with the following command:
```
python models/load_tf_checkpoint.py \
<BASE_DATA_DIR>/phase1/model.ckpt-28252 \
<BASE_DATA_DIR>/phase1/model.ckpt-28252.tf_pickled
```
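For reference, here is a minimal sketch of reading the converted checkpoint back (assuming it was dumped with Python's `pickle` module as described above; replace `<BASE_DATA_DIR>` with your data directory):
```python
import pickle

# The pickled checkpoint is a plain dict of parameter name -> numpy.ndarray,
# so it can be inspected without installing TensorFlow.
with open("<BASE_DATA_DIR>/phase1/model.ckpt-28252.tf_pickled", "rb") as f:
    params = pickle.load(f)

# Print a few parameter names and shapes.
for name, value in list(params.items())[:5]:
    print(name, value.shape)
```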
# Run the tests
```
export NEXP=10 # number of trial runs
export BASE_DATA_DIR=<your_bert_data_dir>
export CONT=<your_docker_image_name>
STAGE=run bash run_with_docker.sh
```
# BERT-large (Bidirectional Encoder Representations from Transformers)
## Model Introduction
BERT is a pre-trained language model based on the Transformer architecture, proposed by Google in 2018. Through large-scale self-supervised pre-training, BERT learns rich language representations that can be applied to a wide range of natural language processing tasks.
The core of BERT is the Transformer encoder, which encodes the input text and produces contextual representations. BERT uses a bidirectional encoder, i.e. it considers both the left and the right context of the input sequence, so it captures more comprehensive language representations. In the pre-training stage, BERT uses two pre-training tasks: Masked Language Model (MLM) and Next Sentence Prediction (NSP).
In the MLM task, BERT randomly replaces some tokens with the "[MASK]" token and then tries to predict the original tokens at those positions. In the NSP task, BERT has to decide whether two sentences are consecutive. Through these two pre-training tasks, BERT learns rich language representations that transfer to various natural language processing tasks.
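As a rough illustration of the MLM objective, the sketch below randomly masks tokens and records the original tokens as prediction targets (a minimal example, not the preprocessing code used in this repository; the 15% masking ratio follows the original BERT paper):
```python
import random

def mask_tokens(tokens, mask_prob=0.15, mask_token="[MASK]"):
    """Randomly replace a fraction of tokens with [MASK]; keep the originals as labels."""
    masked, labels = list(tokens), {}
    for i, tok in enumerate(tokens):
        if random.random() < mask_prob:
            labels[i] = tok          # the model must predict this original token
            masked[i] = mask_token
    return masked, labels

print(mask_tokens("the quick brown fox jumps over the lazy dog".split()))
```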
## Model Structure
BERT-large is a larger and more complex version of BERT. Compared with BERT-base, it has more layers, more parameters and a deeper network, so it can learn deeper and more complex language representations.
BERT-large contains 24 Transformer encoder layers with a hidden size of 1024, for a total of about 340M parameters. In the pre-training stage, BERT-large uses more unlabeled text than BERT-base and is optimized with the Masked Language Model (MLM) and Next Sentence Prediction (NSP) tasks. Its pre-training is more complex than that of BERT-base and takes longer to train.
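A rough back-of-the-envelope check of the ~340M figure (a sketch using the standard BERT-large hyper-parameters: 24 layers, hidden size 1024, feed-forward size 4096, a WordPiece vocabulary of roughly 30K; biases and LayerNorm parameters are ignored):
```python
# Approximate BERT-large parameter count (biases and LayerNorm omitted).
vocab, hidden, ffn, layers, max_pos = 30522, 1024, 4096, 24, 512

embeddings = (vocab + max_pos + 2) * hidden          # word + position + segment embeddings
per_layer = 4 * hidden * hidden + 2 * hidden * ffn   # attention (Q, K, V, O) + feed-forward
total = embeddings + layers * per_layer
print(f"~{total / 1e6:.0f}M parameters")             # roughly 330-340M
```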
## Dataset
The training data comes from the Wikipedia dump of 2020/01/01, a commonly used natural language processing dataset containing Wikipedia articles and their abstracts (i.e. the first paragraph). It can be used for various text-related tasks such as text classification, text summarization and named entity recognition.
Download and preprocess the data as follows:
```
./input_preprocessing/prepare_data.sh --outputdir /workspace/bert_data
python3 models/load_tf_checkpoint.py \
    /workspace/bert_data/phase1/model.ckpt-28252 \
    /workspace/bert_data/phase1/model.ckpt-28252.tf_pickled
```
## Training
### Environment Setup
A training Docker image can be pulled from [光源](https://www.sourcefind.cn/#/service-details):
* Training image:
Install the Python dependencies:
```
pip3 install -r requirements.txt
```
### Training
Training command (a single node with 8 GPUs is used as an example):
```
bash run_8gpu.sh
```
Configurations and data paths differ between environments; adjust the following line in the run_benchmark_8gpu.sh script accordingly:
```
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/DL_DATA/mlperf/bert"}  # set this to the actual data path
```
### Pre-trained Model
The /workspace/bert_data directory holds the pre-trained model as follows:
```
/workspace/bert_data/phase1
└── model.ckpt-28252.tf_pickled  # pre-trained model
```
### Single-GPU Test
Test command:
```
bash run_1gpu.sh
```
## Performance and Accuracy
The tests use the input data described above on Z100L accelerator cards. The following are results for a single node with 8 GPUs:
| Test Platform | Accuracy | Speed(seq/s) |
| :-----------: | :------: | :----------: |
| Z100L | 0.72 | 89.59 |
## Previous Versions
* https://developer.hpccube.com/codes/modelzoo/mlperf_bert-large
## References
* https://mlcommons.org/en/
* https://github.com/mlcommons
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import json
def get_mllog_json(line):
    prefix = ":::MLLOG"
    if not line.startswith(prefix):
        return None
    line = line[len(prefix):].strip()
    return json.loads(line)
def readlines(file_path):
    with open(file_path, "r") as f:
        return list(f.readlines())
def analyze_one_file(file_path, gbs):
    lines = readlines(file_path)
    run_start_t = None
    run_end_t = None
    train_samples = None
    success = None
    for line in lines:
        if "run_start" not in line and "run_stop" not in line and "train_samples" not in line:
            continue
        log_json = get_mllog_json(line)
        if log_json is None or "key" not in log_json:
            continue
        key = log_json["key"]
        if key == "run_start":
            run_start_t = log_json["time_ms"]
        elif key == "train_samples":
            train_samples = log_json["value"]
        elif key == "run_stop":
            run_end_t = log_json["time_ms"]
            success = 1 if log_json["metadata"]["status"] == "success" else 0
            break
    assert (run_start_t is not None and run_end_t is not None and
            success is not None and train_samples is not None), file_path
    assert train_samples % gbs == 0
    return ((run_end_t - run_start_t) / 60.0 / 1000.0, success, train_samples,
            train_samples / gbs)
def avg_without_min_max(times):
    min_t = min(times)
    max_t = max(times)
    min_idx = [i for i, t in enumerate(times) if t == min_t][0]
    max_idx = [i for i, t in enumerate(times) if t == max_t][0]
    times = [t for i, t in enumerate(times) if i != min_idx and i != max_idx]
    return sum(times) / len(times), min_idx, max_idx
class TablePrinter(object):
    def __init__(self, headers):
        self.headers = list([str(h) for h in headers])
        self.rows = []
        self.max_lens = [len(h) for h in self.headers]

    def add_row(self, row):
        assert len(row) == len(self.headers)
        row = [str(item) for item in row]
        self.max_lens = [
            max(length, len(row[i])) for i, length in enumerate(self.max_lens)
        ]
        self.rows.append(row)

    def _aligned_str(self, s, length):
        return s + (' ' * (length - len(s)))

    def _aligned_row(self, row, separator=' '):
        return separator.join([
            self._aligned_str(s, self.max_lens[i]) for i, s in enumerate(row)
        ])

    def print_table(self):
        print(self._aligned_row(self.headers))
        for row in self.rows:
            print(self._aligned_row(row))
def analyze(file_pattern, file_num, gbs, min_train_samples, win_size=10):
    results = []
    for file_idx in range(file_num):
        i = file_idx + 1
        file_path = file_pattern.format(i)
        ret = [i] + list(analyze_one_file(file_path, gbs))
        results.append(ret)

    table1 = TablePrinter([
        'FileIdx',
        'Success',
        'TrainSamples',
        'TrainingSteps',
        'Time(min)',
        'ValidTime(min)',
        'Throughput(s/step)',
    ])
    for file_idx, t, success, samples, step in results:
        table1.add_row([
            file_idx,
            success,
            samples,
            step,
            t,
            t if success else float('inf'),
            t / step * 60.0,
        ])
    table1.print_table()

    n = len(results)
    win_results = []
    for i in range(n - win_size + 1):
        times = [
            results[i + j][1] if results[i + j][2] else float('inf')
            for j in range(win_size)
        ]
        avg_time, min_idx, max_idx = avg_without_min_max(times)
        samples = [
            float(results[i + j][3]) for j in range(win_size)
            if j != min_idx and j != max_idx
        ]
        avg_samples = sum(samples) / len(samples)
        start_idx = results[i][0]
        end_idx = results[i + win_size - 1][0]
        win_results.append((start_idx, end_idx, avg_samples, avg_time))

    print('-' * 120)
    table2 = TablePrinter([
        'StartFileIdx',
        'EndFileIdx',
        'AvgSamples',
        'AvgTime(min)',
        'ValidAvgTime(min)',
    ])
    for start_idx, end_idx, avg_samples, avg_time in win_results:
        valid_avg_time = avg_time if avg_samples >= min_train_samples else float('inf')
        table2.add_row([start_idx, end_idx, avg_samples, avg_time, valid_avg_time])
    table2.print_table()
def get_or_default(idx, default, type=None):
    args = sys.argv
    value = args[idx] if idx < len(args) else default
    return type(value) if type is not None else value
if __name__ == "__main__":
    nargv = len(sys.argv)
    assert nargv >= 2 and nargv <= 5, "Usage: {} {} <file_path_pattern> [<file_num>] [<global_batch_size>] [<min_train_samples>]".format(
        sys.executable, sys.argv[0])
    file_pattern = sys.argv[1]
    file_num = get_or_default(2, 1, int)
    gbs = get_or_default(3, 8 * 56, int)
    min_train_samples = get_or_default(4, 2621696.0 / 1.0387858550359907, float)
    analyze(file_pattern, file_num, gbs, min_train_samples)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
def generate_mask(attention_mask, unpad_fmha=False):
    if unpad_fmha:
        # Sum each row of the [bs, max_seq_len] mask to get the actual seq_len of each row (1-D).
        #seqlen = attention_mask.sum(dim=1).to(dtype=torch.int32).flatten()
        attention_mask_tmp = paddle.sum(attention_mask, axis=1)
        attention_mask_sum = paddle.cast(attention_mask_tmp, 'int32')
        seqlen = paddle.reshape(attention_mask_sum, [-1])
        print("seqlen is ", seqlen)
        # Record the indices of the non-zero elements.
        #indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
        attention_mask_1d = paddle.reshape(attention_mask, [-1])
        indices = paddle.nonzero(attention_mask_1d, as_tuple=False)
        indices = paddle.reshape(indices, [-1])
        # Maximum seq_len of the current batch.
        # maxseqlen = seqlen.max().item()
        maxseqlen_d = paddle.max(seqlen)
        # Note: using paddle.CUDAPinnedPlace() would cause the following errors:
        '''
        File "/usr/local/lib/python3.8/dist-packages/paddle/fluid/framework.py", line 2305, in __init__
            for frame in traceback.extract_stack():
        UnimplementedError: Unsupported place type `CUDAPinnedPlace` when casting paddle place to enum place. (at /limin29/Paddle/paddle/fluid/framework/custom_tensor_utils.h:135)
            [operator < custom_fmha > error]
        '''
        # maxseqlen = paddle.tensor.creation._memcpy(maxseqlen_d, paddle.CUDAPinnedPlace())
        maxseqlen = paddle.tensor.creation._memcpy(maxseqlen_d, paddle.CPUPlace())
        print("maxseqlen", maxseqlen)
        prefix_sum = paddle.cumsum(seqlen, axis=0)
        zero_tensor = paddle.zeros([1], dtype='int32')
        # Prefix sums of the sequence lengths: [0, a[0], a[0] + a[1], ...]
        cu_seqlens = paddle.concat(x=[zero_tensor, prefix_sum])
        # The last element of cu_seqlens is the sum of all actual seq_lens in the current batch.
        # device tensor with shape [1]
        ntokens_d = cu_seqlens[-1]
        # host tensor with shape [1]
        #ntokens = paddle.tensor.creation._memcpy(ntokens_d, paddle.CUDAPinnedPlace())
        ntokens = paddle.tensor.creation._memcpy(ntokens_d, paddle.CPUPlace())
        print("ntokens = ", ntokens)
        return indices, attention_mask, seqlen, ntokens, cu_seqlens, seqlen, maxseqlen
    else:
        raise NotImplementedError()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## DL params
export BATCHSIZE=56
export GRADIENT_STEPS=1
export LR=3.5e-4
export MAX_SAMPLES_TERMINATION=4500000
export MAX_STEPS=7100
export OPT_LAMB_BETA_1=0.9
export OPT_LAMB_BETA_2=0.999
export START_WARMUP_STEP=0
export WARMUP_PROPORTION=0.0
export PHASE=2
export EVAL_ITER_START_SAMPLES=150000
export EVAL_ITER_SAMPLES=150000
## System run parms
export DGXNNODES=1
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=01:15:00
## System config params
source ${BASH_SOURCE%/*}/config_DGXA100_common.sh
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2 # 2 if HT is on, 1 if HT is off