Commit 581b8d15 authored by liangjing's avatar liangjing
Browse files

version 1

parents
Pipeline #169 failed with stages
in 0 seconds
# CMAKE generated file: DO NOT EDIT!
# Generated by "Unix Makefiles" Generator, CMake Version 3.16
# (CMakeDirectoryInformation.cmake: per-directory settings consumed by the
# generated Makefiles; regenerated on every CMake configure run.)
# Relative path conversion top directories.
set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/public/home/zhangqha/bert/pybind")
set(CMAKE_RELATIVE_PATH_TOP_BINARY "/public/home/zhangqha/bert/pybind/build")
# Force unix paths in dependencies.
set(CMAKE_FORCE_UNIX_PATHS 1)
# The C and CXX include file regular expressions for this directory.
set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
# CXX reuses the C regexes so both languages scan includes identically.
set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
# CMAKE generated file: DO NOT EDIT!
# Generated by "Unix Makefiles" Generator, CMake Version 3.16
# NOTE(review): recipe lines below are TAB-indented as make requires; the
# pasted copy had lost the leading tabs, which makes GNU make fail with
# "missing separator".  Content is otherwise unchanged.
# Default target executed when no arguments are given to make.
default_target: all
.PHONY : default_target
# Allow only one "make -f Makefile2" at a time, but pass parallelism.
.NOTPARALLEL:
#=============================================================================
# Special targets provided by cmake.
# Disable implicit rules so canonical targets will work.
.SUFFIXES:
# Remove some rules from gmake that .SUFFIXES does not remove.
SUFFIXES =
.SUFFIXES: .hpux_make_needs_suffix_list
# Suppress display of executed commands (active only when VERBOSE is unset).
$(VERBOSE).SILENT:
# A target that is always out of date.
cmake_force:
.PHONY : cmake_force
#=============================================================================
# Set environment variables for the build.
# The shell in which to execute make rules.
SHELL = /bin/sh
# The CMake executable.
CMAKE_COMMAND = /opt/cmake/bin/cmake
# The command to remove a file.
RM = /opt/cmake/bin/cmake -E remove -f
# Escaping for special characters.
EQUALS = =
# The top-level source directory on which CMake was run.
CMAKE_SOURCE_DIR = /public/home/zhangqha/bert/pybind
# The top-level build directory on which CMake was run.
CMAKE_BINARY_DIR = /public/home/zhangqha/bert/pybind/build
#=============================================================================
# Targets provided globally by CMake.
# Special rule for the target rebuild_cache
rebuild_cache:
	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
	/opt/cmake/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
.PHONY : rebuild_cache
# Special rule for the target rebuild_cache
rebuild_cache/fast: rebuild_cache
.PHONY : rebuild_cache/fast
# Special rule for the target edit_cache
edit_cache:
	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
	/opt/cmake/bin/ccmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
.PHONY : edit_cache
# Special rule for the target edit_cache
edit_cache/fast: edit_cache
.PHONY : edit_cache/fast
# The main all target
all: cmake_check_build_system
	cd /public/home/zhangqha/bert/pybind/build && $(CMAKE_COMMAND) -E cmake_progress_start /public/home/zhangqha/bert/pybind/build/CMakeFiles /public/home/zhangqha/bert/pybind/build/pybind11/CMakeFiles/progress.marks
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/all
	$(CMAKE_COMMAND) -E cmake_progress_start /public/home/zhangqha/bert/pybind/build/CMakeFiles 0
.PHONY : all
# The main clean target
clean:
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/clean
.PHONY : clean
# The main clean target
clean/fast: clean
.PHONY : clean/fast
# Prepare targets for installation.
preinstall: all
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/preinstall
.PHONY : preinstall
# Prepare targets for installation.
preinstall/fast:
	cd /public/home/zhangqha/bert/pybind/build && $(MAKE) -f CMakeFiles/Makefile2 pybind11/preinstall
.PHONY : preinstall/fast
# clear depends
depend:
	cd /public/home/zhangqha/bert/pybind/build && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
.PHONY : depend
# Help Target
help:
	@echo "The following are some of the valid targets for this Makefile:"
	@echo "... all (the default if no target is provided)"
	@echo "... clean"
	@echo "... depend"
	@echo "... rebuild_cache"
	@echo "... edit_cache"
.PHONY : help
#=============================================================================
# Special targets to cleanup operation of make.
# Special rule to run CMake to check the build system integrity.
# No rule that depends on this can have commands that come from listfiles
# because they might be regenerated.
cmake_check_build_system:
	cd /public/home/zhangqha/bert/pybind/build && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
.PHONY : cmake_check_build_system
# Install script for directory: /public/home/zhangqha/bert/pybind/pybind11
# (CMake-generated cmake_install.cmake fragment: establishes the install
# prefix, configuration name, component, and permission defaults before the
# actual install() commands run.)
# Set the install prefix
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
set(CMAKE_INSTALL_PREFIX "/usr/local")
endif()
# Strip a trailing slash so later path concatenation is consistent.
string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
# Set the install configuration name.
if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
if(BUILD_TYPE)
# BUILD_TYPE may arrive with a leading separator (e.g. ".Release"); strip it.
string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
else()
set(CMAKE_INSTALL_CONFIG_NAME "Release")
endif()
message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
endif()
# Set the component getting installed.
if(NOT CMAKE_INSTALL_COMPONENT)
if(COMPONENT)
message(STATUS "Install component: \"${COMPONENT}\"")
set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
else()
set(CMAKE_INSTALL_COMPONENT)
endif()
endif()
# Install shared libraries without execute permission?
if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
set(CMAKE_INSTALL_SO_NO_EXE "0")
endif()
# Is this installation the result of a crosscompile?
if(NOT DEFINED CMAKE_CROSSCOMPILING)
set(CMAKE_CROSSCOMPILING "FALSE")
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import sys
import paddle.fluid.core as core
from paddle.utils.cpp_extension.extension_utils import _get_include_dirs_when_compiling

# Build script for the "functions" pybind11 extension: generates a
# CMakeLists.txt against the installed Paddle headers/library, symlinks the
# pybind11 sources out of the Paddle build tree (COMPILE_DIR), builds with
# CMake + make, and copies the resulting .so next to this script.

cur_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(cur_dir)

CMAKELISTS_TEMPLATE = '''
cmake_minimum_required(VERSION 3.4...3.18)
project(functions LANGUAGES CXX)
add_subdirectory(pybind11)
%s
%s
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -D__HIP_PLATFORM_HCC__=1 ")
set(extension_name "functions")
add_definitions("-DMLPERF_EXTENSION_NAME=${extension_name}")
pybind11_add_module(${extension_name} functions.cc)
target_link_libraries(${extension_name} PRIVATE %s)
'''


def _run(cmd):
    """Echo `cmd`, run it through the shell, and abort on failure.

    FIX: the original used ``assert os.system(cmd) == 0``; asserts are
    stripped under ``python -O``, which would let a failed compile pass
    silently.  Raise explicitly instead.
    """
    print(cmd)
    if os.system(cmd) != 0:
        raise RuntimeError("command failed: {}".format(cmd))


compile_dir = os.environ["COMPILE_DIR"]
dir_lists = _get_include_dirs_when_compiling(compile_dir)
dirs = ["include_directories({})".format(d) for d in dir_lists]

# Preprocessor macros to bake into the generated CMakeLists.  A value of
# None emits a bare -DNAME (no =value).
macros = {}
#if core.is_compiled_with_cuda():
#    macros['PADDLE_WITH_CUDA'] = None
#    macros['EIGEN_USE_GPU'] = None
if core.is_compiled_with_mkldnn():
    macros['PADDLE_WITH_MKLDNN'] = None
if core.is_compiled_with_nccl():
    # NOTE(review): both NCCL and RCCL macros are defined together here --
    # presumably the ROCm build reports its RCCL backend through this
    # check; confirm before porting to a CUDA build.
    macros['PADDLE_WITH_NCCL'] = None
    macros['PADDLE_WITH_RCCL'] = None
macros['EIGEN_USE_HIP'] = None
macros['THRUST_IGNORE_CUB_VERSION_CHECK'] = None
macros = "\n".join([
    'add_definitions(-D{}{})'.format(k, '=' + str(v) if v is not None else "")
    for k, v in macros.items()
])

# assumes the installed Paddle wheel ships fluid/core_avx.so -- TODO
# confirm for non-AVX builds (which ship core_noavx.so instead).
paddle_so = os.path.join(os.path.dirname(paddle.__file__), "fluid/core_avx.so")
cmakelists_context = CMAKELISTS_TEMPLATE % ("\n".join(dirs), macros, paddle_so)
with open("CMakeLists.txt", "w") as f:
    f.write(cmakelists_context)

cmake_args = {
    'CMAKE_BUILD_TYPE': 'Release',
    'PYBIND11_PYTHON_VERSION':
    '{}.{}'.format(sys.version_info.major, sys.version_info.minor),
}
_run("rm -rf pybind11 && ln -s {}/third_party/pybind/src/extern_pybind pybind11".format(
    compile_dir))
_run("rm -rf build && mkdir -p build && cd build && cmake .. {} && make -j `nproc`".format(
    " ".join(["-D{}={}".format(k, v) for k, v in cmake_args.items()])))

# Exactly one extension .so is expected in build/; copy it alongside this
# script so it can be imported directly.
so_file = [f for f in os.listdir('build') if f.endswith('.so')]
assert len(so_file) == 1
so_file = so_file[0]
_run("cp build/{} .".format(so_file))
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstdint>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace framework = paddle::framework;
namespace platform = paddle::platform;
// Expose LoDTensorArray to Python as an opaque type (no automatic list
// conversion), so the arrays built below are passed by reference.
PYBIND11_MAKE_OPAQUE(framework::LoDTensorArray);
// Slot indices of the per-minibatch tensor array produced by the
// Process*Inputs functions below.
constexpr int kInputIdsIdx = 0;             // token ids, [bs, max_seq_length]
constexpr int kSegmentIdsIdx = 1;           // segment/type ids
constexpr int kInputMaskIdx = 2;            // attention mask (nonzero = valid)
constexpr int kMaskedLmLabelsIdx = 3;       // MLM labels (0 = not masked)
constexpr int kNextSentenceLabelsIdx = 4;   // NSP label per sample, [bs]
constexpr int kSeqLenIdx = 5;               // per-sample sequence length, [bs]
constexpr int kPrefixSumSeqLenIdx = 6;      // prefix sum of seq lens, [bs + 1]
constexpr int kNonZerosIndicesIdx = 7;      // flat indices of nonzero mask
constexpr int kMaskedLmIdsIdx = 8;          // labels at masked positions
constexpr int kMaskedLmPositionIdx = 9;     // flat positions of masked tokens
constexpr int kNumValidIdx = 10;            // scalar: count of masked tokens
constexpr int kNumTensors = 11;             // total slots per minibatch
// Re-batches BERT pretraining inputs that were all-gathered from
// `num_trainers` devices into per-minibatch tensor arrays for trainer
// `trainer_id` ("exchange padding").
//
// `array` is a flat 1-D buffer: num_trainers consecutive per-device
// segments of num_per_device elements each; a segment holds input_ids,
// segment_ids, input_mask and masked_lm_labels (each
// num_samples * max_seq_length values), then next_sentence_labels
// (num_samples) and seq_len (num_samples).
//
// For every minibatch, the samples of all devices are sorted by
// descending sequence length and dealt out round-robin; this trainer
// takes every num_trainers-th entry so all trainers receive similarly
// sized sequences.
//
// Returns a 2-element vector: [0] = per-minibatch arrays of the
// kNumTensors device-feed inputs; [1] = per-minibatch single-tensor
// arrays holding a host copy of the seq-len prefix sum.
template <typename T>
std::vector<std::vector<framework::LoDTensorArray>>
ProcessAllGatheredBERTInputs(
    const py::array_t<T, py::array::c_style | py::array::forcecast> &array,
    size_t num_samples,
    size_t max_seq_length,
    size_t batch_size,
    size_t trainer_id,
    size_t num_trainers) {
  using TensorT = framework::LoDTensor;
  PADDLE_ENFORCE_EQ(array.ndim(), 1);
  size_t length = array.shape()[0];
  const T *arr = array.data();
  // Everything below is pure C++; release the GIL for the duration.
  py::gil_scoped_release gil_release_guard;
  const size_t nbatch = (num_samples + batch_size - 1) / batch_size;
  // FIX: this scratch index buffer used to be T[]; with T = int16_t any
  // index >= 32768 would overflow and scramble the shuffle.  Indices are
  // positions, not payload, so store them as size_t.
  std::unique_ptr<size_t[]> seq_indices(new size_t[batch_size * num_trainers]);
  const size_t numel = num_samples * max_seq_length;
  const size_t num_per_device = numel * 4 + num_samples * 2;
  PADDLE_ENFORCE_EQ(num_per_device * num_trainers, length);
  // Allocation helpers for the three element types produced below.
  auto resize_and_alloc = [](TensorT *t, const framework::DDim &dim) -> T * {
    t->Resize(dim);
    return t->mutable_data<T>(platform::CPUPlace());
  };
  auto resize_and_alloc_int = [](TensorT *t,
                                 const framework::DDim &dim) -> int * {
    t->Resize(dim);
    return t->mutable_data<int>(platform::CPUPlace());
  };
  auto resize_and_alloc_float32 = [](TensorT *t,
                                     const framework::DDim &dim) -> float * {
    t->Resize(dim);
    return t->mutable_data<float>(platform::CPUPlace());
  };
  VLOG(10) << "num_samples = " << num_samples;
  VLOG(10) << "max_seq_length = " << max_seq_length;
  VLOG(10) << "batch_size = " << batch_size;
  VLOG(10) << "trainer_id = " << trainer_id;
  VLOG(10) << "num_trainers = " << num_trainers;
  VLOG(10) << "nbatch = " << nbatch;
  VLOG(10) << "length= " << length;
  std::vector<std::vector<framework::LoDTensorArray>> gpu_cpu_tensors;
  std::vector<framework::LoDTensorArray> tensors(nbatch);
  std::vector<framework::LoDTensorArray> tensors_2(nbatch);
  for (size_t i = 0; i < nbatch; ++i) {
    // The last minibatch may be short.
    const size_t cur_bs =
        std::min((i + 1) * batch_size, num_samples) - i * batch_size;
    VLOG(10) << "Mini batch " << i << " " << cur_bs;
    // Offset of this minibatch's seq_len entries within one device segment.
    const size_t seq_length_offset =
        num_samples * max_seq_length * 4 + num_samples + i * batch_size;
    const size_t total_seq_length = cur_bs * num_trainers;
    std::iota(seq_indices.get(),
              seq_indices.get() + total_seq_length,
              static_cast<size_t>(0));
    // Sort sample indices of ALL devices by descending sequence length.
    // idx % num_trainers selects the source device, idx / num_trainers
    // the row inside this minibatch on that device.
    std::sort(seq_indices.get(),
              seq_indices.get() + total_seq_length,
              [&](size_t idx1, size_t idx2) {
                size_t real_idx1 = (idx1 % num_trainers) * num_per_device +
                                   (idx1 / num_trainers) + seq_length_offset;
                size_t real_idx2 = (idx2 % num_trainers) * num_per_device +
                                   (idx2 / num_trainers) + seq_length_offset;
                return arr[real_idx1] > arr[real_idx2];
              });
    tensors[i].resize(kNumTensors);
    tensors_2[i].resize(1);
    auto *input_ids = resize_and_alloc(
        &tensors[i][kInputIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *segment_ids = resize_and_alloc(
        &tensors[i][kSegmentIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *input_mask = resize_and_alloc(
        &tensors[i][kInputMaskIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *masked_lm_labels = resize_and_alloc(
        &tensors[i][kMaskedLmLabelsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *next_sentence_labels = resize_and_alloc(
        &tensors[i][kNextSentenceLabelsIdx], {static_cast<int64_t>(cur_bs)});
    auto *seq_len = resize_and_alloc_int(&tensors[i][kSeqLenIdx],
                                         {static_cast<int64_t>(cur_bs)});
    auto *prefix_sum_seq_len = resize_and_alloc_int(
        &tensors[i][kPrefixSumSeqLenIdx], {static_cast<int64_t>(cur_bs + 1)});
    auto *num_valid = resize_and_alloc_float32(&tensors[i][kNumValidIdx],
                                               {static_cast<int64_t>(1)});
    // cpu tensor: host-side copy of the prefix sum for the data feeder.
    auto *host_prefix_sum_seq_len = resize_and_alloc_int(
        &tensors_2[i][0], {static_cast<int64_t>(cur_bs + 1)});
    prefix_sum_seq_len[0] = 0;
    int sum_seq_len = 0;
    // Deal sorted samples round-robin: this trainer takes entries
    // trainer_id, trainer_id + num_trainers, ...
    for (size_t j = 0; j < cur_bs; ++j) {
      const size_t idx = seq_indices.get()[j * num_trainers + trainer_id];
      const size_t dev_id = idx % num_trainers;
      const T *data = arr + dev_id * num_per_device;
      const size_t sample_id = idx / num_trainers + i * batch_size;
      std::memcpy(input_ids + j * max_seq_length,
                  data + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(segment_ids + j * max_seq_length,
                  data + numel + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(input_mask + j * max_seq_length,
                  data + 2 * numel + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(masked_lm_labels + j * max_seq_length,
                  data + 3 * numel + sample_id * max_seq_length,
                  max_seq_length * sizeof(T));
      next_sentence_labels[j] = data[4 * numel + sample_id];
      seq_len[j] = data[4 * numel + num_samples + sample_id];
      sum_seq_len += seq_len[j];
      if (j > 0) {
        prefix_sum_seq_len[j] = prefix_sum_seq_len[j - 1] + seq_len[j - 1];
      }
    }
    prefix_sum_seq_len[cur_bs] =
        prefix_sum_seq_len[cur_bs - 1] + seq_len[cur_bs - 1];
    std::memcpy(host_prefix_sum_seq_len,
                prefix_sum_seq_len,
                sizeof(int) * (cur_bs + 1));
    PADDLE_ENFORCE_LE(sum_seq_len, cur_bs * max_seq_length);
    auto *nonzeros_indices = resize_and_alloc_int(
        &tensors[i][kNonZerosIndicesIdx], {static_cast<int64_t>(sum_seq_len)});
    // First pass: collect flat indices of valid (nonzero-mask) tokens and
    // count the masked-LM positions so their tensors can be sized exactly.
    int cur_nonzero_ind = 0;
    int cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (input_mask[ids] != 0) {
          nonzeros_indices[cur_nonzero_ind++] = static_cast<int>(ids);
        }
        if (masked_lm_labels[ids] != 0) {
          cur_num_valid += 1;
        }
      }
    }
    PADDLE_ENFORCE_EQ(cur_nonzero_ind, sum_seq_len);
    *num_valid = static_cast<float>(cur_num_valid);
    auto *masked_lm_ids = resize_and_alloc_int(
        &tensors[i][kMaskedLmIdsIdx], {static_cast<int64_t>(cur_num_valid)});
    auto *masked_lm_positions =
        resize_and_alloc_int(&tensors[i][kMaskedLmPositionIdx],
                             {static_cast<int64_t>(cur_num_valid)});
    // Second pass: fill positions and label ids of the masked tokens.
    cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (masked_lm_labels[ids] != 0) {
          masked_lm_positions[cur_num_valid] = ids;
          masked_lm_ids[cur_num_valid] = masked_lm_labels[ids];
          cur_num_valid += 1;
        }
      }
    }
  }
  gpu_cpu_tensors.push_back(tensors);
  gpu_cpu_tensors.push_back(tensors_2);
  return gpu_cpu_tensors;
}
// Splits a 2-D eval dataset into per-minibatch tensor arrays (single
// trainer, no exchange padding).  Each row of `array` is one sample of
// one_sample_len elements: input_ids, segment_ids, input_mask and
// masked_lm_labels (max_seq_length each), then next_sentence_label and
// seq_len.  When `need_sort` is set, samples are processed in ascending
// seq_len order (seq_len is read from the LAST element of each row).
// Returns {device_tensors, host_prefix_sum_tensors} like
// ProcessAllGatheredBERTInputs.
template <typename T>
std::vector<std::vector<framework::LoDTensorArray>> ProcessBERTEvalInputs(
    const py::array_t<T, py::array::c_style | py::array::forcecast> &array,
    size_t max_seq_length,
    size_t batch_size,
    bool need_sort) {
  using TensorT = framework::LoDTensor;
  PADDLE_ENFORCE_EQ(array.ndim(), 2);
  size_t num_samples = array.shape()[0];
  size_t one_sample_len = array.shape()[1];
  const T *arr = array.data();
  // Everything below is pure C++; release the GIL for the duration.
  py::gil_scoped_release gil_release_guard;
  std::unique_ptr<size_t[]> seq_indices;
  if (need_sort) {
    seq_indices.reset(new size_t[num_samples]);
    std::iota(seq_indices.get(),
              seq_indices.get() + num_samples,
              static_cast<size_t>(0));
    // Ascending order of each row's last element (the sequence length).
    std::sort(seq_indices.get(),
              seq_indices.get() + num_samples,
              [arr, one_sample_len](size_t idx1, size_t idx2) {
                idx1 = (idx1 + 1) * one_sample_len - 1;
                idx2 = (idx2 + 1) * one_sample_len - 1;
                return arr[idx1] < arr[idx2];
              });
  }
  const size_t nbatch = (num_samples + batch_size - 1) / batch_size;
  // Allocation helpers for the three element types produced below.
  auto resize_and_alloc = [](TensorT *t, const framework::DDim &dim) -> T * {
    t->Resize(dim);
    return t->mutable_data<T>(platform::CPUPlace());
  };
  auto resize_and_alloc_int = [](TensorT *t,
                                 const framework::DDim &dim) -> int * {
    t->Resize(dim);
    return t->mutable_data<int>(platform::CPUPlace());
  };
  auto resize_and_alloc_float32 = [](TensorT *t,
                                     const framework::DDim &dim) -> float * {
    t->Resize(dim);
    return t->mutable_data<float>(platform::CPUPlace());
  };
  VLOG(10) << "one_sample_len = " << one_sample_len;
  VLOG(10) << "num_samples = " << num_samples;
  VLOG(10) << "max_seq_length = " << max_seq_length;
  VLOG(10) << "batch_size = " << batch_size;
  VLOG(10) << "nbatch = " << nbatch;
  std::vector<std::vector<framework::LoDTensorArray>> gpu_cpu_tensors;
  std::vector<framework::LoDTensorArray> tensors(nbatch);
  std::vector<framework::LoDTensorArray> tensors_2(nbatch);
  for (size_t i = 0; i < nbatch; ++i) {
    // The last minibatch may be short.
    const size_t cur_bs =
        std::min((i + 1) * batch_size, num_samples) - i * batch_size;
    VLOG(10) << "Mini batch " << i << " " << cur_bs;
    tensors[i].resize(kNumTensors);
    tensors_2[i].resize(1);
    auto *input_ids = resize_and_alloc(
        &tensors[i][kInputIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *segment_ids = resize_and_alloc(
        &tensors[i][kSegmentIdsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *input_mask = resize_and_alloc(
        &tensors[i][kInputMaskIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *masked_lm_labels = resize_and_alloc(
        &tensors[i][kMaskedLmLabelsIdx],
        {static_cast<int64_t>(cur_bs), static_cast<int64_t>(max_seq_length)});
    auto *next_sentence_labels = resize_and_alloc(
        &tensors[i][kNextSentenceLabelsIdx], {static_cast<int64_t>(cur_bs)});
    auto *seq_len = resize_and_alloc_int(&tensors[i][kSeqLenIdx],
                                         {static_cast<int64_t>(cur_bs)});
    auto *prefix_sum_seq_len = resize_and_alloc_int(
        &tensors[i][kPrefixSumSeqLenIdx], {static_cast<int64_t>(cur_bs + 1)});
    auto *num_valid = resize_and_alloc_float32(&tensors[i][kNumValidIdx],
                                               {static_cast<int64_t>(1)});
    // cpu tensor: host-side copy of the prefix sum for the data feeder.
    auto *host_prefix_sum_seq_len = resize_and_alloc_int(
        &tensors_2[i][0], {static_cast<int64_t>(cur_bs + 1)});
    prefix_sum_seq_len[0] = 0;
    int sum_seq_len = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      const T *data = arr;
      size_t sample_id = j + i * batch_size;
      if (need_sort) sample_id = seq_indices.get()[sample_id];
      // Row layout: [ids | segment | mask | mlm_labels | nsp | seq_len].
      std::memcpy(input_ids + j * max_seq_length,
                  data + sample_id * one_sample_len,
                  max_seq_length * sizeof(T));
      std::memcpy(segment_ids + j * max_seq_length,
                  data + sample_id * one_sample_len + max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(input_mask + j * max_seq_length,
                  data + sample_id * one_sample_len + 2 * max_seq_length,
                  max_seq_length * sizeof(T));
      std::memcpy(masked_lm_labels + j * max_seq_length,
                  data + sample_id * one_sample_len + 3 * max_seq_length,
                  max_seq_length * sizeof(T));
      next_sentence_labels[j] =
          data[sample_id * one_sample_len + 4 * max_seq_length];
      seq_len[j] = data[sample_id * one_sample_len + 4 * max_seq_length + 1];
      sum_seq_len += seq_len[j];
      if (j > 0) {
        prefix_sum_seq_len[j] = prefix_sum_seq_len[j - 1] + seq_len[j - 1];
      }
    }
    prefix_sum_seq_len[cur_bs] =
        prefix_sum_seq_len[cur_bs - 1] + seq_len[cur_bs - 1];
    std::memcpy(host_prefix_sum_seq_len,
                prefix_sum_seq_len,
                sizeof(int) * (cur_bs + 1));
    PADDLE_ENFORCE_LE(sum_seq_len, cur_bs * max_seq_length);
    auto *nonzeros_indices = resize_and_alloc_int(
        &tensors[i][kNonZerosIndicesIdx], {static_cast<int64_t>(sum_seq_len)});
    // First pass: collect flat indices of valid (nonzero-mask) tokens and
    // count masked-LM positions so their tensors can be sized exactly.
    int cur_nonzero_ind = 0;
    int cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (input_mask[ids] != 0) {
          nonzeros_indices[cur_nonzero_ind++] = static_cast<int>(ids);
        }
        if (masked_lm_labels[ids] != 0) {
          cur_num_valid += 1;
        }
      }
    }
    PADDLE_ENFORCE_EQ(cur_nonzero_ind, sum_seq_len);
    // Implicit int -> float conversion (the sibling training-path function
    // casts explicitly); same value either way.
    *num_valid = cur_num_valid;
    auto *masked_lm_ids = resize_and_alloc_int(
        &tensors[i][kMaskedLmIdsIdx], {static_cast<int64_t>(cur_num_valid)});
    auto *masked_lm_positions =
        resize_and_alloc_int(&tensors[i][kMaskedLmPositionIdx],
                             {static_cast<int64_t>(cur_num_valid)});
    // Second pass: fill positions and label ids of the masked tokens.
    cur_num_valid = 0;
    for (size_t j = 0; j < cur_bs; ++j) {
      for (size_t k = 0; k < max_seq_length; ++k) {
        int ids = j * max_seq_length + k;
        if (masked_lm_labels[ids] != 0) {
          masked_lm_positions[cur_num_valid] = ids;
          masked_lm_ids[cur_num_valid] = masked_lm_labels[ids];
          cur_num_valid += 1;
        }
      }
    }
  }
  gpu_cpu_tensors.push_back(tensors);
  gpu_cpu_tensors.push_back(tensors_2);
  return gpu_cpu_tensors;
}
// Module entry point; MLPERF_EXTENSION_NAME is injected by the generated
// CMakeLists.  Each function is registered for three integer dtypes.
// NOTE(review): since the arrays are declared with py::array::forcecast,
// the first registered overload (int16_t) may accept and silently cast
// arrays of any dtype -- verify the intended dispatch with callers.
PYBIND11_MODULE(MLPERF_EXTENSION_NAME, m) {
m.def("process_allgathered_inputs", &ProcessAllGatheredBERTInputs<int16_t>);
m.def("process_allgathered_inputs", &ProcessAllGatheredBERTInputs<int32_t>);
m.def("process_allgathered_inputs", &ProcessAllGatheredBERTInputs<int64_t>);
m.def("process_eval_inputs", &ProcessBERTEvalInputs<int16_t>);
m.def("process_eval_inputs", &ProcessBERTEvalInputs<int32_t>);
m.def("process_eval_inputs", &ProcessBERTEvalInputs<int64_t>);
}
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Build the NCCL LD_PRELOAD wrapper and install the torch extension.
# FIX: the dirname command substitution was unquoted (`dirname "$0"` inside
# an unquoted backtick expansion) and would word-split on paths containing
# spaces; quote the whole substitution.
DIR=$(readlink -f "$(dirname "$0")")
g++ "$DIR/nccl.cc" -std=c++17 -fPIC -shared -o "$DIR/libnccl_wrapper.so" -I/usr/local/cuda/include -ldl -lnccl
# Build/install in a subshell so the caller's working directory is untouched.
(cd "$DIR" && rm -rf build && python3.8 setup.py install --force)
echo "Set the following env before run:"
echo ""
echo 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:'$DIR" LD_PRELOAD=$DIR/libnccl_wrapper.so"
echo ""
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Throws std::runtime_error carrying the failing expression text.  Used
// instead of assert() so the checks survive NDEBUG builds.
#define ASSERT_CHECK(__cond) \
do { \
if (!(__cond)) throw std::runtime_error(#__cond); \
} while (0)
#include "nccl.h" // NOLINT
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include "dlfcn.h" // NOLINT
// Sentinel reduction op: ncclProd is repurposed to mean "replace me with a
// freshly created PreMulSum op" (see NCCLPreMulSumInfo::CreateOrReturn).
// Assumes callers never issue a genuine ncclProd reduction -- confirm.
constexpr ncclRedOp_t UNUSED = ncclProd;
// Function-pointer types of the real NCCL symbols resolved via dlsym.
using AllReduceT = decltype(&ncclAllReduce);
using ReduceScatterT = decltype(&ncclReduceScatter);
using RedOpCreatePreMulSumT = decltype(&ncclRedOpCreatePreMulSum);
using RedOpDestroyT = decltype(&ncclRedOpDestroy);
// Path of the real NCCL shared object to dlopen: the NCCL_SO_PATH
// environment variable when set, otherwise the default "libnccl.so".
static std::string GetNCCLSoPath() {
  if (const char *override_path = std::getenv("NCCL_SO_PATH")) {
    return override_path;
  }
  return "libnccl.so";
}
struct NCCLHandle {
NCCLHandle() {
auto so_path = GetNCCLSoPath();
void *handle = dlopen(so_path.c_str(), RTLD_NOW | RTLD_LOCAL);
this->ncclAllReduce =
reinterpret_cast<AllReduceT>(dlsym(handle, "ncclAllReduce"));
ASSERT_CHECK(this->ncclAllReduce != nullptr);
this->ncclReduceScatter =
reinterpret_cast<ReduceScatterT>(dlsym(handle, "ncclReduceScatter"));
ASSERT_CHECK(this->ncclReduceScatter != nullptr);
this->ncclRedOpCreatePreMulSum = reinterpret_cast<RedOpCreatePreMulSumT>(
dlsym(handle, "ncclRedOpCreatePreMulSum"));
ASSERT_CHECK(this->ncclRedOpCreatePreMulSum != nullptr);
this->ncclRedOpDestroy =
reinterpret_cast<RedOpDestroyT>(dlsym(handle, "ncclRedOpDestroy"));
ASSERT_CHECK(this->ncclRedOpDestroy != nullptr);
fprintf(stderr, "%s loaded successfully\n", so_path.c_str());
}
AllReduceT ncclAllReduce = nullptr;
ReduceScatterT ncclReduceScatter = nullptr;
RedOpCreatePreMulSumT ncclRedOpCreatePreMulSum = nullptr;
RedOpDestroyT ncclRedOpDestroy = nullptr;
} g_nccl_handle;
// Holds the scalar/dtype/residence parameters for a pending
// ncclRedOpCreatePreMulSum call.  InitNCCLPreMulSum() stashes them; the
// intercepted collectives then create the custom op on demand
// (CreateOrReturn) and free it after the collective is issued (Destroy).
// NOTE(review): a single global instance (g_info) -- assumes collectives
// are not issued concurrently from multiple threads; confirm with callers.
struct NCCLPreMulSumInfo {
  // Record the premul-sum scalar to apply on the next collective.
  void Init(const void *scalar,
            ncclDataType_t dtype,
            ncclScalarResidence_t residence) {
    scalar_ = const_cast<void *>(scalar);
    dtype_ = dtype;
    residence_ = residence;
  }
  // If `op` is the UNUSED sentinel, create a PreMulSum op for `comm` from
  // the recorded parameters and return it; otherwise pass `op` through.
  ncclRedOp_t CreateOrReturn(ncclRedOp_t op, ncclComm_t comm) {
    if (op != UNUSED) return op;
    ASSERT_CHECK(ncclSuccess ==
                 g_nccl_handle.ncclRedOpCreatePreMulSum(
                     &op_, scalar_, dtype_, residence_, comm));
    comm_ = comm;
    is_created_ = true;
    return op_;
  }
  // Release the op created by CreateOrReturn (no-op when none was created).
  void Destroy() {
    if (is_created_) {
      ASSERT_CHECK(ncclSuccess == g_nccl_handle.ncclRedOpDestroy(op_, comm_));
      op_ = UNUSED;
      comm_ = nullptr;
      is_created_ = false;
    }
  }
 private:
  ncclRedOp_t op_ = UNUSED;
  ncclComm_t comm_ = nullptr;
  bool is_created_ = false;  // true between CreateOrReturn and Destroy
  void *scalar_ = nullptr;   // premul scalar (host or device pointer)
  ncclDataType_t dtype_ = ncclFloat16;
  ncclScalarResidence_t residence_ = ncclScalarDevice;
} g_info;
// Exported with C linkage so these definitions interpose the real NCCL
// symbols when this library is LD_PRELOADed (see build.sh).
extern "C" {
// Called by the torch extension to register the scalar for the next
// PreMulSum reduction (see torch_ex.cc).
void InitNCCLPreMulSum(const void *scalar,
                       ncclDataType_t dtype,
                       ncclScalarResidence_t residence) {
  g_info.Init(scalar, dtype, residence);
}
// Interposed ncclAllReduce: swaps the UNUSED sentinel op for a freshly
// created PreMulSum op, forwards to the real NCCL, then destroys the op.
ncclResult_t ncclAllReduce(const void *sendbuff,
                           void *recvbuff,
                           size_t count,
                           ncclDataType_t datatype,
                           ncclRedOp_t op,
                           ncclComm_t comm,
                           cudaStream_t stream) {
  op = g_info.CreateOrReturn(op, comm);
  auto ret = g_nccl_handle.ncclAllReduce(
      sendbuff, recvbuff, count, datatype, op, comm, stream);
  // NOTE(review): the op is destroyed right after the (asynchronous)
  // collective is enqueued -- ncclRedOpDestroy is assumed safe here;
  // confirm against the NCCL documentation for the targeted version.
  g_info.Destroy();
  return ret;
}
// Interposed ncclReduceScatter: same sentinel-substitution scheme.
ncclResult_t ncclReduceScatter(const void *sendbuff,
                               void *recvbuff,
                               size_t recvcount,
                               ncclDataType_t datatype,
                               ncclRedOp_t op,
                               ncclComm_t comm,
                               cudaStream_t stream) {
  op = g_info.CreateOrReturn(op, comm);
  auto ret = g_nccl_handle.ncclReduceScatter(
      sendbuff, recvbuff, recvcount, datatype, op, comm, stream);
  g_info.Destroy();
  return ret;
}
}  // extern "C"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import os
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME
from setuptools import setup, find_packages

# Builds the "torch_ex" CUDA extension from torch_ex.cc and links it
# against the libnccl_wrapper.so produced by build.sh.
# NOTE(review): torch, CppExtension, CUDA_HOME and find_packages are
# imported but not used below -- confirm before removing.

# Restrict compilation to compute capability 8.0 (A100); extend for other
# target GPUs.
os.environ['TORCH_CUDA_ARCH_LIST'] = '8.0'
setup(
    name="torch_ex",
    version="0.1",
    description="PyTorch Extensions written by Baidu",
    ext_modules=[
        CUDAExtension(
            name="torch_ex",
            sources=['torch_ex.cc'],
            extra_compile_args={
                "cxx": [
                    "-O3", "-DVERSION_GE_1_1", "-DVERSION_GE_1_3",
                    "-DVERSION_GE_1_5", "-fPIC"
                ],
                "nvcc": [
                    "-O3", "-DVERSION_GE_1_1", "-DVERSION_GE_1_3",
                    "-DVERSION_GE_1_5", "-Xcompiler='-fPIC'"
                ],
            },
            # -lnccl_wrapper: requires libnccl_wrapper.so on the linker
            # search path (built alongside by build.sh).
            extra_link_args=['-fPIC', '-lnccl_wrapper'])
    ],
    cmdclass={"build_ext": BuildExtension})
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/extension.h>
#include "nccl.h" // NOLINT
// Throws std::runtime_error carrying the failing expression text.  Used
// instead of assert() so the checks survive NDEBUG builds.
#define ASSERT_CHECK(__cond) \
do { \
if (!(__cond)) throw std::runtime_error(#__cond); \
} while (0)
// Provided by libnccl_wrapper.so (nccl.cc), which the run environment is
// expected to LD_PRELOAD; stores the PreMulSum scalar used by the
// intercepted NCCL collectives.
extern "C" {
extern void InitNCCLPreMulSum(const void *scalar,
                              ncclDataType_t dtype,
                              ncclScalarResidence_t residence);
}
// Registers tensor `t`'s data pointer as the scalar for the next NCCL
// PreMulSum reduction (forwarded to the preloaded wrapper library).
// Supports half/float/double tensors; any other dtype throws.
void InitNCCLPreMulSumByTensor(const at::Tensor &t) {
  const void *scalar = t.data_ptr();
  auto dtype = t.options().dtype();
  ncclDataType_t nccl_dtype;
  if (dtype == at::ScalarType::Half) {
    nccl_dtype = ncclFloat16;
  } else if (dtype == at::ScalarType::Float) {
    nccl_dtype = ncclFloat32;
  } else if (dtype == at::ScalarType::Double) {
    nccl_dtype = ncclFloat64;
  } else {
    ASSERT_CHECK(false);  // unsupported scalar dtype
  }
  // Device tensors pass a device pointer; host tensors use the
  // host-immediate residence mode.
  auto residence =
      (t.device().is_cuda() ? ncclScalarDevice : ncclScalarHostImmediate);
  InitNCCLPreMulSum(scalar, nccl_dtype, residence);
}
// Torch extension entry point: exposes make_nccl_premul_sum(tensor).
// The GIL is released around the call (it touches no Python state).
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  namespace py = pybind11;
  m.def("make_nccl_premul_sum",
        &InitNCCLPreMulSumByTensor,
        py::call_guard<py::gil_scoped_release>());
}
# Bert介绍
## 应用领域:
自然语言理解大模型
## 目标精度
Mask-LM accuracy 达到0.72
## 模型基本参数设置
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30522
# 测试前准备
## 数据集准备
### progress bars in model download and training scripts
boto3==1.14.0
gdown==3.13.0
git+https://github.com/mlcommons/logging.git@2.0.0-rc2
h5py==2.10.0
html2text==2020.1.16
ipdb==0.13.2
nltk==3.5
onnxruntime==1.3.0
parameterized
progressbar==2.5
requests==2.23.0
six==1.15.0
tensorflow==2.2.0
数据预处理时尽量将所有采用库的版本号对齐,以免出现md5码不一致问题
参见bert目录下 README.md制作数据
## 环境部署
1、准备dtk 21.04环境
2、Mlperf bert文件夹内包含paddlepaddle_rocm-0.0.0-cp36-cp36m-linux_x86_64.whl
python3 -m pip install paddlepaddle_rocm-0.0.0-cp36-cp36m-linux_x86_64.whl
## 安装python依赖包
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
# 测试脚本
## 8卡打开exchange padding测试
cp rundir_8gpu_exchange/* .
sbatch run_sbatch.sh
## 1024卡大规模并发测试
cp rundir_8gpu_exchange/* .
sbatch run_sbatch.sh
输出结果见worker.*文件
# 优化测试结果整理
测试数据存放目录:result.log
## 扩展性测试
| GPU卡数 | 单卡batch_size | gradient_accumulation | 吞吐量(seq/s) | 并行效率 |
|-------|--------------|-----------------------|------------------|--------------------------|
| 4 | 4 | 1 | 36.69 | 100% |
| 8 | 4 | 1 | 65.7 | 89.53% |
| 1024 | 4 | 1 | 7723.38 | 82.23% |
| 1024 | 8 | 1 | 9362.93-9416.84 | 99.6%-100.25%(以单节点4卡为基准) |
## 性能优化测试
| GPU卡数 | 单卡batch_size | gradient_accumulation_steps | global batch size | |  混精度 | gemm优化 | softmax+softmax_cross_entropy  | distributed_fused_lamb | GeLU近似算法 | exchange padding | 收敛global_steps | walltime(s) |
|-------|--------------|-----------------------------|-------------------|------|---------------|--------------|--------------------------------|------------------------|---------------|----------------------|----------------|-------------|
| 8 | 4 | 14 | 448 | 优化前: | | 51.26seq/s | 85.3seq/s | 89.59seq/s | | 6697 (global steps) | 6697 | 32522.67 |
| | | | | 优化后: | 91.92seq/s  | 85.3seq/s | 89.59seq/s | 91.92seq/s  | 91.92seq/s  |  5692 (global steps) | 5692 | |
| 1024 | 4 | 1 | 4096 | 优化前: | 4458.04seq/s | | 7461seq/s | 5174.44seq/s | 7353.08seq/s | 必须off | 684 | 369.325 |
| | | | | 优化后: | 7723.38seq/s | 5174.44seq/s | 7723.38seq/s | 7461seq/s | 7723.38seq/s | | | |
| 1024 | 8 | 2 | 16384 | 优化前: |  --- | | 10634seq/s | 9083seq/s | | 必须off | 794 | 580.618 |
| | | | | 优化后: | 11330.07seq/s | 9083seq/s | 11330.07seq/s | 10634seq/s | 11330.07seq/s |
jieba
h5py
colorlog
colorama
seqeval
multiprocess
mpi4py
paddlenlp
git+https://github.com/mlperf/logging.git@2.0.0-rc1
#!/bin/bash
# Single-GPU benchmark launcher: stages the 1-GPU init_env.py and runs the
# benchmark as one MPI rank.
cp rundir_1gpu/init_env.py .
export PADDLE_TRAINERS_NUM=1
export PADDLE_TRAINER_ENDPOINTS=localhost:60001
# A fixed seed may be supplied via SEED; otherwise a random one is chosen.
export SEED=${SEED:-"$RANDOM"}
mpirun -np 1 --allow-run-as-root -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS ./run_benchmark_1gpu.sh
#!/bin/bash
# 8-GPU launcher: stage the run-specific environment module, then start the
# benchmark under MPI with the variables below forwarded to every rank.
cp rundir_8gpu/init_env.py .
export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
# One endpoint per trainer process (8 trainers on localhost).
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
# Keep a caller-provided SEED; otherwise pick a random one for this run.
export SEED=${SEED:-"$RANDOM"}
export LD_LIBRARY_PATH=/opt/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export use_hierarchical_allreduce=True
# 16 MPI ranks total: ranks >= PADDLE_TRAINERS_NUM act as reader processes
# (see the IS_TRAINER/IS_READER split inside run_benchmark_8gpu.sh).
export num_process=16
if [[ $num_process -gt 1 ]]; then
# NOTE: a dead `ORTERUN=`which orterun`` assignment was removed here;
# the variable was never referenced anywhere in this script.
mpirun="mpirun --allow-run-as-root -np $num_process --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark_8gpu.sh"
else
mpirun=""
fi
# NOTE(review): $CMD is never assigned in this script -- presumably it is
# meant to come from the caller's environment (and is usually empty). Confirm.
echo "command is " $mpirun $CMD
for NPROC_PER_NODE in 8; do
export NPROC_PER_NODE=$NPROC_PER_NODE
$mpirun $CMD
done
#!/bin/bash
# 8-GPU FP32 launcher: identical to the mixed-precision launcher except that
# it invokes run_benchmark_8gpu_fp32.sh. Variables below are forwarded to
# every MPI rank with -x.
cp rundir_8gpu/init_env.py .
export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
# One endpoint per trainer process (8 trainers on localhost).
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
# Keep a caller-provided SEED; otherwise pick a random one for this run.
export SEED=${SEED:-"$RANDOM"}
export LD_LIBRARY_PATH=/opt/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export use_hierarchical_allreduce=True
# 16 MPI ranks total: ranks >= PADDLE_TRAINERS_NUM act as reader processes
# (see the IS_TRAINER/IS_READER split inside run_benchmark_8gpu_fp32.sh).
export num_process=16
if [[ $num_process -gt 1 ]]; then
# NOTE: a dead `ORTERUN=`which orterun`` assignment was removed here;
# the variable was never referenced anywhere in this script.
mpirun="mpirun --allow-run-as-root -np $num_process --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark_8gpu_fp32.sh"
else
mpirun=""
fi
# NOTE(review): $CMD is never assigned in this script -- presumably it is
# meant to come from the caller's environment (and is usually empty). Confirm.
echo "command is " $mpirun $CMD
for NPROC_PER_NODE in 8; do
export NPROC_PER_NODE=$NPROC_PER_NODE
$mpirun $CMD
done
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --- Single-GPU BERT pretraining benchmark: environment and data setup ---
set -ex
export FLAGS_rocm_dir=/opt/dtk-21.04
export FLAGS_max_inplace_grad_add=2
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
# Dataset root; overridable by the caller.
BASE_DATA_DIR=${BASE_DATA_DIR:-"/data/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
# Switch to the var-length shards only when the uncompressed dataset is
# explicitly disabled; with USE_UNCOMPRESSED_DATASET=1 above, the else
# branch always runs here.
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
# Trainer identity is derived from the Open MPI rank variables.
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-"localhost:60045"}
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# Maps this MPI rank to a device id via CUDA_VISIBLE_DEVICES.
# NOTE(review): this helper is defined but never called in this script, and
# the Python in the heredoc appears to have lost its indentation in this
# copy of the file -- confirm against the original source.
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
# Ranks below PADDLE_TRAINERS_NUM train; the remaining ranks are readers
# with no GPU visible.
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
# Paddle runtime flags for this run.
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
export FLAGS_enable_nvtx=1
# --- Hyperparameters and feature toggles for the 1-GPU run ---
batch_size=4
eval_batch_size=9
#use_amp=True
#use_pure_fp16=True
use_amp=True
use_pure_fp16=False
max_steps=820
log_freq=50
# NOTE(review): eval_iter_start_samples / eval_iter_samples are set here but
# never passed to run_pretrain.py below -- confirm whether they are read via
# the environment or are simply dead.
eval_iter_start_samples=175000
eval_iter_samples=175000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
###fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
# Per-trainer log destination.
# NOTE(review): LOG_FILE is unused here because the redirecting launch line
# at the bottom is commented out.
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
export FLAGS_max_inplace_grad_add=2
# CPU-side exchange padding requires the uncompressed shards.
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
# Full command line for the pretraining entry point.
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.00255 \
--weight_decay 0.0 \
--lamb_epsilon 1e-06 \
--start_warmup_step -76 \
--warmup_proportion 0.0 \
--warmup_steps 256 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 1 \
--opt_lamb_beta_1 0.71 \
--opt_lamb_beta_2 0.88 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
#Run experiments
#python3 -u $BERT_CMD >& $LOG_FILE
python3 -u $BERT_CMD
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --- 8-GPU BERT pretraining benchmark (mixed precision): env and command ---
set -ex
#export ROCBLAS_LAYER=3
export FLAGS_rocm_dir=/opt/dtk-21.04/
export FLAGS_max_inplace_grad_add=2
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
# Dataset root; overridable by the caller.
BASE_DATA_DIR=${BASE_DATA_DIR:-"/data/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
# Switch to the var-length shards only when the uncompressed dataset is
# explicitly disabled; with USE_UNCOMPRESSED_DATASET=1 above, the else
# branch always runs here.
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
# Trainer identity is derived from the Open MPI rank variables.
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-""}
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# Maps this MPI rank to a device id via CUDA_VISIBLE_DEVICES.
# NOTE(review): this helper is defined but never called (its call site is
# commented out below), and the Python in the heredoc appears to have lost
# its indentation in this copy of the file -- confirm against the original.
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
# Ranks below PADDLE_TRAINERS_NUM train; the remaining ranks are readers
# with no GPU visible.
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #$(expr $OMPI_COMM_WORLD_RANK % 4) #`get_device_id`
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
# Paddle runtime flags for this run.
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
#export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
export FLAGS_enable_nvtx=1
#export FLAGS_inplace_addto_external_ops=custom_fused_dense_grad
# --- Hyperparameters and feature toggles for the 8-GPU run ---
batch_size=4
eval_batch_size=63
#eval_batch_size=16
use_amp=True
use_pure_fp16=True
max_steps=7100
log_freq=50
# NOTE(review): eval_iter_start_samples / eval_iter_samples are set here but
# never passed to run_pretrain.py below -- confirm whether they are read via
# the environment or are simply dead.
eval_iter_start_samples=150000
eval_iter_samples=150000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
#fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
# Per-trainer log destination (used by the numactl launch below).
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
#export FLAGS_lamb_allreduce_first=1
#export FLAGS_use_multi_tensor_apply=1
export FLAGS_max_inplace_grad_add=2
# CPU-side exchange padding requires the uncompressed shards.
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
# Full command line for the pretraining entry point.
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.000425 \
--weight_decay 1e-2 \
--lamb_epsilon 1e-6 \
--start_warmup_step 0 \
--warmup_proportion 0.0 \
--warmup_steps 0 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 14 \
--opt_lamb_beta_1 0.9 \
--opt_lamb_beta_2 0.999 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
## 2>&1 | tee $LOG_FILE"
APP="python3 -u $BERT_CMD"
# ---------------------------------------------------------------------------
# NUMA/DCU binding for this local MPI rank.
# The original code spelled out eight identical `case` branches whose only
# real difference was the branch index itself; compute the index once
# instead. Behavior is unchanged for local ranks 0-7. As a robustness
# improvement, an unset lrank now fails fast via ${lrank:?} instead of
# silently matching no branch and running nothing.
# ---------------------------------------------------------------------------
bind_id=$((${lrank:?OMPI_COMM_WORLD_LOCAL_RANK not set} % 8))
# NOTE(review): the `% 4` in this message looks like a leftover from a
# 4-device-per-node setup; the output text is kept byte-identical.
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
# All 8 devices stay visible; each rank selects its own via
# FLAGS_selected_gpus.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FLAGS_selected_gpus=$bind_id
#export UCX_NET_DEVICES=mlx5_0:1
#export UCX_IB_PCI_BW=mlx5_0:50Gbs
# Pin CPU and memory allocation to the NUMA node matching the device,
# redirecting stdout+stderr to this rank's log file.
numactl --cpunodebind=$bind_id --membind=$bind_id ${APP} >& $LOG_FILE
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --- 8-GPU BERT pretraining benchmark (FP32 variant): env and command ---
# Differs from the mixed-precision script in use_pure_fp16=False and the
# run_pretrain_fp32.py entry point.
set -ex
#export ROCBLAS_LAYER=3
export FLAGS_rocm_dir=/opt/dtk-21.04/
export FLAGS_max_inplace_grad_add=2
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
# Dataset root; overridable by the caller.
BASE_DATA_DIR=${BASE_DATA_DIR:-"/data/mlperf/bert"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
# Switch to the var-length shards only when the uncompressed dataset is
# explicitly disabled; with USE_UNCOMPRESSED_DATASET=1 above, the else
# branch always runs here.
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
# Trainer identity is derived from the Open MPI rank variables.
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-""}
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# Maps this MPI rank to a device id via CUDA_VISIBLE_DEVICES.
# NOTE(review): this helper is defined but never called (its call site is
# commented out below), and the Python in the heredoc appears to have lost
# its indentation in this copy of the file -- confirm against the original.
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
# Ranks below PADDLE_TRAINERS_NUM train; the remaining ranks are readers
# with no GPU visible.
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #$(expr $OMPI_COMM_WORLD_RANK % 4) #`get_device_id`
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
# Paddle runtime flags for this run.
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
#export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
export FLAGS_enable_nvtx=1
#export FLAGS_inplace_addto_external_ops=custom_fused_dense_grad
# --- Hyperparameters and feature toggles for the 8-GPU FP32 run ---
batch_size=4
eval_batch_size=63
#eval_batch_size=16
use_amp=True
use_pure_fp16=False
max_steps=7100
log_freq=50
# NOTE(review): eval_iter_start_samples / eval_iter_samples are set here but
# never passed to run_pretrain_fp32.py below -- confirm whether they are
# read via the environment or are simply dead.
eval_iter_start_samples=150000
eval_iter_samples=150000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
#fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
# Per-trainer log destination (used by the numactl launch below).
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
#export FLAGS_lamb_allreduce_first=1
#export FLAGS_use_multi_tensor_apply=1
export FLAGS_max_inplace_grad_add=2
# CPU-side exchange padding requires the uncompressed shards.
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
# Full command line for the FP32 pretraining entry point.
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain_fp32.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.000425 \
--weight_decay 1e-2 \
--lamb_epsilon 1e-6 \
--start_warmup_step 0 \
--warmup_proportion 0.0 \
--warmup_steps 0 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 14 \
--opt_lamb_beta_1 0.9 \
--opt_lamb_beta_2 0.999 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
## 2>&1 | tee $LOG_FILE"
APP="python3 -u $BERT_CMD"
# ---------------------------------------------------------------------------
# NUMA/DCU binding for this local MPI rank (FP32 variant).
# The original code spelled out eight identical `case` branches whose only
# real difference was the branch index itself; compute the index once
# instead. Behavior is unchanged for local ranks 0-7. As a robustness
# improvement, an unset lrank now fails fast via ${lrank:?} instead of
# silently matching no branch and running nothing.
# ---------------------------------------------------------------------------
bind_id=$((${lrank:?OMPI_COMM_WORLD_LOCAL_RANK not set} % 8))
# NOTE(review): the `% 4` in this message looks like a leftover from a
# 4-device-per-node setup; the output text is kept byte-identical.
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
# All 8 devices stay visible; each rank selects its own via
# FLAGS_selected_gpus.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FLAGS_selected_gpus=$bind_id
#export UCX_NET_DEVICES=mlx5_0:1
#export UCX_IB_PCI_BW=mlx5_0:50Gbs
# Pin CPU and memory allocation to the NUMA node matching the device,
# redirecting stdout+stderr to this rank's log file.
numactl --cpunodebind=$bind_id --membind=$bind_id ${APP} >& $LOG_FILE
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Containerized-run driver: builds the image when STAGE=build, otherwise
# launches the benchmark inside Docker.
export BASE_DATA_DIR="/home/users/mlperf-workspace/bert_data"
# Taken from the SECOND positional argument, defaulting to 400.
# NOTE(review): $1 is not referenced anywhere in this snippet -- confirm the
# caller's argument convention.
export NEXP="${2:-"400"}"
export STAGE=${STAGE:-"run"}
export CONT="nvcr.io/nvidia/pytorch:22.04-py3-paddle-dev-test"
export CLEAR_CACHES=1
if [[ $STAGE == "build" ]]; then
bash -ex Dockerfiles/build_with_pip_install_whl.sh
else
# rm -rf results
bash -ex run_with_docker.sh
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment