# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
get_timestamp() {
local log_file="$1"
cat "$log_file" | grep -E 'run_start|run_stop' | awk '{print $5}' | awk -F',' '{print $1}'
}
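# Assumption: the MLPerf-style run_start/run_stop log lines carry a millisecond
# timestamp in the fifth whitespace-separated field, possibly followed by a
# comma; get_timestamp extracts just the numeric part.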
unset GREP_OPTIONS
for i in $(seq 1 10); do
    log_file="$1$i/worker.0"
    start_t=$(get_timestamp "$log_file" | head -n 1)
    end_t=$(get_timestamp "$log_file" | tail -n 1)
    time_cost=$(python -c "print(($end_t - $start_t) / 60.0 / 1000.0)")
echo "$time_cost"
done
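# Usage sketch: invoked as `<script> <log_dir_prefix>`, this reads
# <log_dir_prefix>1/worker.0 through <log_dir_prefix>10/worker.0 and prints
# each run's wall-clock duration in minutes (timestamps are in milliseconds).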
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from mpi4py import MPI
import numpy as np
import time
import paddle
from pybind.functions import process_allgathered_inputs as process_bert_inputs
from pybind.functions import process_eval_inputs as process_bert_eval_inputs
import h5py
import random
global_comm = MPI.COMM_WORLD
global_rank = global_comm.rank
global_world_size = global_comm.size
assert global_world_size % 2 == 0
def create_group_comm(ranks):
ranks = list(ranks)
new_group = global_comm.group.Incl(ranks)
new_comm = global_comm.Create_group(new_group)
return new_comm
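# create_group_comm returns a communicator restricted to `ranks`; it is used
# below to build the trainer-only, reader-only, and per-pair trainer/reader
# communicators.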
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(comm, seeds, root=0):
seeds = np.array(seeds).astype(np.int64)
comm.Bcast(seeds, root=root)
return seeds.tolist()
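# Seeds are broadcast from the root as a contiguous int64 NumPy buffer so the
# uppercase mpi4py Bcast (which requires buffer-like objects) can be used and
# every rank ends up with an identical seed list.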
def select_dataset_file_for_each_worker(files, f_start_id, worker_num,
worker_index):
"""
Spliting the train file according to the worker index.
"""
num_files = len(files)
if worker_num > num_files:
remainder = worker_num % num_files
data_file = files[(
f_start_id * worker_num + worker_index + remainder * f_start_id) %
num_files]
else:
data_file = files[(f_start_id * worker_num + worker_index) % num_files]
# limin-todo:
#data_file = "/data2/zengjinle/dataset/bert_data/hdf5/training-4320/hdf5_4320_shards_uncompressed/part_01799_of_04320.hdf5"
#print("data_file: ", data_file)
return data_file
def read_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.array(f[key], dtype=dtype) for key in keys]
n = outputs[0].shape[0]
masked_lm_labels = np.zeros(outputs[0].shape, dtype=dtype)
lengths = np.zeros(n, dtype=dtype)
for i in range(n):
masked_lm_positions = outputs[3][i]
masked_lm_ids = outputs[4][i]
length = np.count_nonzero(masked_lm_positions)
masked_lm_labels[i][
masked_lm_positions[:length]] = masked_lm_ids[:length]
lengths[i] = np.count_nonzero(outputs[1][i])
outputs = [
outputs[0], outputs[2], outputs[1], masked_lm_labels, outputs[-1],
lengths
]
idx = np.random.choice(np.arange(n), n, replace=False)
for i in range(len(outputs)):
outputs[i] = outputs[i][idx]
return outputs
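# read_hdf5_file returns [input_ids, segment_ids, input_mask, masked_lm_labels,
# next_sentence_labels, lengths], all shuffled by one shared random permutation
# so that corresponding rows stay aligned across the six arrays.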
def read_eval_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.asarray(f[key][:]) for key in keys]
nsamples = outputs[0].shape[0]
all_data = []
for index in range(nsamples):
        [
            input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, next_sentence_labels
        ] = [np.asarray(arr[index].astype(dtype)) for arr in outputs]
length = np.count_nonzero(masked_lm_positions)
masked_lm_positions = masked_lm_positions[:length]
masked_lm_ids = masked_lm_ids[:length]
masked_lm_labels = np.zeros(input_ids.shape, dtype=dtype)
masked_lm_labels[masked_lm_positions] = masked_lm_ids
#if index == 0:
# print("masked_lm_labels = ", masked_lm_labels)
# print("masked_lm_positions = ", masked_lm_positions)
# print("masked_lm_ids = ", masked_lm_ids)
seq_len = np.asarray(np.count_nonzero(input_mask))
data = [
input_ids,
segment_ids,
input_mask,
masked_lm_labels,
next_sentence_labels,
seq_len,
]
# (2050, ), i.e., 512 * 4 + 1 + 1
one_sample_data = np.concatenate([d.flatten() for d in data])
all_data.extend(one_sample_data)
# (2050000, ) -> (10000, 2050)
return np.asarray(all_data).reshape((nsamples, -1))
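# Each row of the returned matrix is one flattened evaluation sample
# (4 * max_seq_length + 2 values); process_bert_eval_inputs below unpacks it.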
class WorkerInitObj(object):
    """Construct the object with a different seed so that the DataLoader
    generates data with a different seed in each worker."""
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
class Context:
def __init__(self):
half_size = int(global_world_size / 2)
self.trainer_id = global_rank % half_size
self.trainer_num = half_size
self.is_trainer = (global_rank < half_size)
self.reader_id = self.trainer_id
self.reader_num = self.trainer_num
self.is_reader = not self.is_trainer
self.trainer_comm = create_group_comm(range(0, half_size))
self.reader_comm = create_group_comm(
range(half_size, global_world_size))
self.trainer_reader_comm = create_group_comm(
[self.trainer_id, self.trainer_id + half_size])
self.global_comm = global_comm
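    # Rank layout: ranks [0, world_size/2) act as trainers and ranks
    # [world_size/2, world_size) as readers; trainer i is paired with reader
    # i + world_size/2 via trainer_reader_comm (local ranks 0 and 1).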
def init_args(self, args, dtype=np.int16):
self.args = args
self.files = [
os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
if os.path.isfile(os.path.join(args.input_dir, f)) and "part" in f
]
self.files.sort()
self.fid_buf = np.array([1], dtype=np.int64)
with h5py.File(self.files[0], 'r') as f:
self.num_samples = np.array(f["next_sentence_labels"][:]).size
self.batch_size = args.train_batch_size
self.max_seq_length = args.max_seq_length
self.worker_seeds, self.shuffling_seeds = self._setup_seeds(
args.seed, args.num_epochs_to_generate_seeds_for)
self.epoch_idx = 0
data_buf_size = self.num_samples * 4 * self.max_seq_length + self.num_samples * 2
self.data_buf = np.empty(
shape=[self.trainer_num * data_buf_size], dtype=dtype)
self.eval_dir = args.eval_dir
self.num_eval_examples = args.num_eval_examples
self.eval_batch_size = args.eval_batch_size
cur_seed = self.worker_seeds[self.trainer_id]
np.random.seed(cur_seed)
random.seed(cur_seed)
paddle.seed(cur_seed)
self.worker_init = WorkerInitObj(cur_seed)
self.barrier()
def shuffle_files(self):
random.Random(self.shuffling_seeds[self.epoch_idx]).shuffle(self.files)
self.epoch_idx += 1
def _setup_seeds(self, master_seed, epochs):
if master_seed is None:
master_seed = random.SystemRandom().randint(0, 2**32 - 1)
if self.trainer_id == 0:
print('Using random master seed: {}'.format(master_seed))
else:
print('Using master seed from command line: {}'.format(master_seed))
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, self.trainer_num)
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
worker_seeds = broadcast_seeds(self.global_comm, worker_seeds)
shuffling_seeds = broadcast_seeds(self.global_comm, shuffling_seeds)
return worker_seeds, shuffling_seeds
def worker_seed(self):
return self.worker_seeds[self.trainer_id]
def barrier(self):
self.global_comm.barrier()
def stop_reader(self):
if self.is_trainer:
self.read_file(-1)
def file_num(self):
return len(self.files)
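    # read_file protocol: the trainer sends a file id to its paired reader and
    # blocks on Recv; each reader loads its HDF5 shard, Allgathers the raw
    # sample buffers across all readers, and sends the gathered buffer back to
    # its trainer, which converts it to batches via process_bert_inputs.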
def read_file(self, f_id=None):
if self.is_trainer:
self.fid_buf[0] = f_id
self.trainer_reader_comm.Isend(self.fid_buf, dest=1)
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return
self.trainer_reader_comm.Recv(self.data_buf, source=1)
results = process_bert_inputs(self.data_buf, self.num_samples,
self.max_seq_length, self.batch_size,
self.trainer_id, self.trainer_num)
return results
else:
            self.trainer_reader_comm.Recv(self.fid_buf, source=0)
f_id = self.fid_buf[0]
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return False
fname = select_dataset_file_for_each_worker(
self.files, f_id, self.trainer_num, self.trainer_id)
data = read_hdf5_file(fname, dtype=self.data_buf.dtype)
send_buf = np.concatenate([d.flatten() for d in data])
self.reader_comm.Allgather(send_buf, self.data_buf)
self.trainer_reader_comm.Send(self.data_buf, dest=0)
return True
def read_eval_file(self):
if self.is_trainer:
eval_data = []
for eval_file in sorted(os.listdir(self.eval_dir)):
eval_file_path = os.path.join(self.eval_dir, eval_file)
if os.path.isfile(eval_file_path) and 'part' in eval_file_path:
data = read_eval_hdf5_file(
eval_file_path, dtype=self.data_buf.dtype)
eval_data.extend(data)
if len(eval_data) > self.num_eval_examples:
break
chunk_size = self.num_eval_examples // self.trainer_num
rank = self.trainer_id
remainder = self.num_eval_examples % self.trainer_num
if rank < remainder:
eval_data = eval_data[(chunk_size + 1) * rank:(chunk_size + 1) *
(rank + 1)]
else:
eval_data = eval_data[chunk_size * rank + remainder:chunk_size *
(rank + 1) + remainder]
results = process_bert_eval_inputs(eval_data, self.max_seq_length,
self.eval_batch_size,
self.args.sort_eval_data)
return results
_context = Context()
def get_context():
return _context
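# A minimal driver sketch (an assumption for illustration, not part of this
# module): trainers iterate over file ids while readers loop until they
# receive a negative id.
#
#   ctx = get_context()
#   ctx.init_args(args)
#   if ctx.is_trainer:
#       for f_id in range(ctx.file_num()):
#           batches = ctx.read_file(f_id)  # blocks until the paired reader replies
#           # ... run training steps on `batches` ...
#       ctx.stop_reader()                  # tells the paired reader to exit
#   else:
#       while ctx.read_file():             # returns False once f_id < 0 arrives
#           pass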
#!/bin/bash
export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
export SEED=${SEED:-"$RANDOM"}
export LD_LIBRARY_PATH=/opt/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
#export HSA_FORCE_FINE_GRAIN_PCIE=1
#export NCCL_P2P_LEVEL=5
#export use_hierarchical_allreduce=True
export num_process=16
if [[ $num_process -gt 1 ]]; then
    ORTERUN=$(which orterun)
mpirun="mpirun --allow-run-as-root -np $num_process --bind-to none -x UCX_IB_ADDR_TYPE=ib_global -x UCX_TLS=rc_x,rocm_copy -mca btl_tcp_if_exclude ib0 -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark.sh"
else
mpirun=""
fi
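# With num_process=16 and PADDLE_TRAINERS_NUM=8, mpirun launches 8 trainer
# ranks plus 8 reader ranks, matching the even world size asserted by the MPI
# data-reader module.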
echo "command is " $mpirun $CMD
for NPROC_PER_NODE in 8; do
echo "command is " $mpirun $CMD
export NPROC_PER_NODE=$NPROC_PER_NODE
$mpirun $CMD
done
#mpirun -np 8 --allow-run-as-root --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON ./run_benchmark.sh
#!/bin/bash
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex
#export ROCBLAS_LAYER=3
export FLAGS_rocm_dir=/opt/dtk-21.04/
export FLAGS_max_inplace_grad_add=2
export NCCL_P2P_LEVEL=5
export USE_NV_INPUT=1
USE_UNCOMPRESSED_DATASET=1
BASE_DATA_DIR=${BASE_DATA_DIR:-"/public/DL_DATA/mlperf/bert/"}
export USE_NV_INPUT
UNCOMPRESSED_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_uncompressed
VARLENGTH_DATA_DIR=$BASE_DATA_DIR/hdf5/training-4320/hdf5_4320_shards_varlength
export DATA_DIR=$UNCOMPRESSED_DATA_DIR
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
if [[ "$USE_NV_INPUT" == "1" && "$USE_UNCOMPRESSED_DATASET" == "0" ]]; then
export DATA_DIR="$VARLENGTH_DATA_DIR"
export EVAL_DIR=$BASE_DATA_DIR/hdf5/eval
else
export USE_UNCOMPRESSED_DATASET=1
fi
export USE_UNCOMPRESSED_DATASET
export TF_CKPT_PATH=$BASE_DATA_DIR/phase1/model.ckpt-28252.tf_pickled
export BERT_CONFIG_PATH=$BASE_DATA_DIR/phase1/bert_config.json
export PYTHON=python3
OMPI_COMM_WORLD_RANK=${OMPI_COMM_WORLD_RANK:-"0"}
export PADDLE_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS_NUM:-"1"}
export PADDLE_TRAINER_ENDPOINTS=${PADDLE_TRAINER_ENDPOINTS:-""}
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
function get_device_id() {
$PYTHON <<EOF
import paddle
import os
gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None)
if gpus is None:
print($OMPI_COMM_WORLD_RANK)
else:
gpus = gpus.split(",")
print(gpus[$OMPI_COMM_WORLD_RANK])
EOF
}
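# get_device_id maps this rank to an entry of CUDA_VISIBLE_DEVICES (falling
# back to the rank itself); it is currently unused because the device list is
# hard-coded below.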
if [[ $PADDLE_TRAINER_ID -lt $PADDLE_TRAINERS_NUM ]]; then
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 #$(expr $OMPI_COMM_WORLD_RANK % 4) #`get_device_id`
export IS_TRAINER=1
export IS_READER=0
else
export CUDA_VISIBLE_DEVICES=""
export IS_TRAINER=0
export IS_READER=1
fi
echo "Trainer :" $CUDA_VISIBLE_DEVICES $PADDLE_TRAINER_ENDPOINTS $PADDLE_TRAINERS_NUM
export FLAGS_sync_nccl_allreduce=0
export FLAGS_fraction_of_gpu_memory_to_use=0.99
#export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_call_stack_level=2
export FLAGS_use_fast_math=0
#export FLAGS_check_nan_inf=1
#export PADDLE_INF_NAN_SKIP_OP="matmul_v2_grad,reshape2_grad,cast,elementwise_add_grad,reduce_mean_grad,softmax_with_cross_entropy_grad"
#export CUDA_LAUNCH_BLOCKING=1
#export FLAGS_benchmark=1
export FLAGS_enable_nvtx=1
#export FLAGS_inplace_addto_external_ops=custom_fused_dense_grad
batch_size=4
eval_batch_size=63
#eval_batch_size=16
use_amp=True
use_pure_fp16=True
max_steps=7100
log_freq=50
eval_iter_start_samples=150000
eval_iter_samples=150000
max_seq_length=512
dense_seq_output=True
unpad=False
unpad_fmha=False
fused_bias_mha=True
fused_bias_fc=True
## can be False or True
weight_transpose=True
#fused_dropout_add_ln=True
fused_dropout_add_ln=False
exchange_padding=True
cpu_exchange_padding=True
distributed_lamb=True
unpad_embed=False
unpad_fmha_mke_opt=True
sort_eval_data=False
LOG_DIR="log_${PADDLE_TRAINERS_NUM}"
mkdir -p ${LOG_DIR}
LOG_FILE=${LOG_DIR}/worker.${PADDLE_TRAINER_ID}
#export FLAGS_lamb_allreduce_first=1
#export FLAGS_use_multi_tensor_apply=1
export FLAGS_max_inplace_grad_add=2
if [[ "$exchange_padding" == "true" || "$exchange_padding" == "True" ]]; then
if [[ "$cpu_exchange_padding" == "true" || "$cpu_exchange_padding" == "True" ]]; then
export DATA_DIR="$UNCOMPRESSED_DATA_DIR"
fi
fi
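# Assumption: the CPU exchange-padding path reads raw fixed-length samples, so
# it forces DATA_DIR back to the uncompressed shards even when the varlength
# dataset was selected above.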
#$NSYS_CMD $BIND_CMD $PYTHON -u run_pretrain.py \
BERT_CMD="run_pretrain.py \
--max_predictions_per_seq 76 \
--train_batch_size $batch_size \
--eval_batch_size $eval_batch_size \
--sort_eval_data $sort_eval_data \
--learning_rate 0.000425 \
--weight_decay 1e-2 \
--lamb_epsilon 1e-6 \
--start_warmup_step 0 \
--warmup_proportion 0.0 \
--warmup_steps 0 \
--input_dir $DATA_DIR \
--log_freq $log_freq \
--max_steps $max_steps \
--tf_ckpt_path $TF_CKPT_PATH \
--bert_config_path $BERT_CONFIG_PATH \
--unpad $unpad \
--unpad_fmha $unpad_fmha \
--unpad_fmha_mke_opt $unpad_fmha_mke_opt \
--unpad_embed $unpad_embed \
--fused_bias_mha $fused_bias_mha \
--fused_bias_fc $fused_bias_fc \
--fused_dropout_add_ln $fused_dropout_add_ln \
--weight_transpose $weight_transpose \
--max_seq_length $max_seq_length \
--eval_dir $EVAL_DIR \
--distributed_lamb $distributed_lamb \
--exchange_padding $exchange_padding \
--cpu_exchange_padding $cpu_exchange_padding \
--seed $SEED \
--use_uncompressed_dataset $USE_UNCOMPRESSED_DATASET \
--dense_seq_output $dense_seq_output \
--gradient_accumulation_steps 14 \
--opt_lamb_beta_1 0.9 \
--opt_lamb_beta_2 0.999 \
--enable_addto True \
--use_pure_fp16 $use_pure_fp16 \
--use_amp $use_amp"
## 2>&1 | tee $LOG_FILE"
APP="python3 -u $BERT_CMD"
# Select this local rank's device and bind the process to the matching NUMA node.
dev=$(expr $lrank % 8)
echo "work ${lrank} less than ${PADDLE_TRAINERS_NUM} on DCU $(expr $lrank % 4)"
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FLAGS_selected_gpus=$dev
#export UCX_NET_DEVICES=mlx5_0:1
#export UCX_IB_PCI_BW=mlx5_$(expr $lrank % 4):50Gbs
numactl --cpunodebind=$dev --membind=$dev ${APP} >& $LOG_FILE
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
std::vector<std::vector<int64_t>> AccMergeInferShape(
const std::vector<int64_t> &acc, const std::vector<int64_t> &total) {
return {{2}, {2}};
}
std::vector<paddle::DataType> AccMergeInferDType(paddle::DataType acc,
paddle::DataType total) {
return {paddle::DataType::FLOAT64, paddle::DataType::INT64};
}
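// acc_merge contract (see the kernel file below): "Acc" is a scalar accuracy
// fraction and "Total" the sample count for the current evaluation chunk;
// "Out" is a double[2] = {correct, total} running sum and "Step" an int64[2]
// {current_step, accumulation_period} counter maintained on the CPU side.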
PD_BUILD_OP(acc_merge)
.Inputs({"Acc", "Total"})
.Outputs({"Out", "Step"})
.SetInferShapeFn(PD_INFER_SHAPE(AccMergeInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AccMergeInferDType));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/custom_raw_op_kernel_func.h"
template <typename T, bool NeedAccumulate>
static __device__ __forceinline__ void AccMerge(T acc, T total, T *out) {
auto correct = static_cast<int64_t>(acc * total + 0.5);
if (NeedAccumulate) {
out[0] += correct;
out[1] += total;
} else {
out[0] = correct;
out[1] = total;
}
}
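// Rounding acc * total + 0.5 recovers the integer number of correct
// predictions from the fractional accuracy before it is either written or
// accumulated into the running {correct, total} pair.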
template <typename T1, typename T2, bool NeedAccumulate>
static __global__ void AccMergeKernelCPUTotal(const T1 *acc,
int64_t total,
T2 *out) {
AccMerge<T2, NeedAccumulate>(*acc, total, out);
}
template <typename T1, typename T2, bool NeedAccumulate>
static __global__ void AccMergeKernelGPUTotal(const T1 *acc,
const T1 *total,
T2 *out) {
AccMerge<T2, NeedAccumulate>(*acc, *total, out);
}
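// Step layout: step[1] is the accumulation period and step[0] cycles through
// 0..step[1]-1; the first launch of each cycle overwrites Out
// (NeedAccumulate = false) and later launches accumulate into it.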
__PD_DEFINE_RAW_OP_KERNEL_FUNC(acc_merge, ctx) {
namespace f = paddle::framework;
namespace p = paddle::platform;
auto &step_t = *ctx.Output<f::Tensor>("Step");
auto *step = step_t.data<int64_t>();
if (step[1] <= 0) return;
const auto &total_t = *ctx.Input<f::Tensor>("Total");
bool is_cpu_place = p::is_cpu_place(total_t.place());
using Type1 = float;
using Type2 = double;
const auto &acc_t = *ctx.Input<f::Tensor>("Acc");
auto *acc = acc_t.data<Type1>();
auto &out_t = *ctx.Output<f::Tensor>("Out");
out_t.Resize({2});
auto *out = out_t.mutable_data<Type2>(acc_t.place());
auto stream = ctx.cuda_device_context().stream();
if (step[0] == 0) {
if (is_cpu_place) {
AccMergeKernelCPUTotal<Type1, Type2, false><<<1, 1, 0, stream>>>(
acc, *total_t.data<int64_t>(), out);
} else {
AccMergeKernelGPUTotal<Type1, Type2, false><<<1, 1, 0, stream>>>(
acc, total_t.data<float>(), out);
}
} else {
if (is_cpu_place) {
AccMergeKernelCPUTotal<Type1, Type2, true><<<1, 1, 0, stream>>>(
acc, *total_t.data<int64_t>(), out);
} else {
AccMergeKernelGPUTotal<Type1, Type2, true><<<1, 1, 0, stream>>>(
acc, total_t.data<Type1>(), out);
}
}
step[0] = (step[0] + 1) % step[1];
}
{
"custom_setup_ops.so": "56632beeaffe91ec64ae156669db712a",
"define_macros": [
[
"PADDLE_WITH_HIP",
null
],
[
"PADDLE_WITH_RCCL",
null
]
],
"extra_compile_args": [
"-w"
],
"extra_link_args": [
"-l:core_avx.so",
"-lamdhip64"
],
"include_dirs": [
"/public/home/zhangqha/.conda/envs/hhenv/include/python3.6m",
"/public/home/zhangqha/.conda/envs/hhenv/lib/python3.6/site-packages/numpy/core/include",
"/public/home/zhangqha/dtk-21.04/hipcub/include",
"/public/home/zhangqha/dtk-21.04/hiprand/include",
"/public/home/zhangqha/dtk-21.04/hipsparse/include",
"/public/home/zhangqha/dtk-21.04/include",
"/public/home/zhangqha/dtk-21.04/miopen/include",
"/public/home/zhangqha/dtk-21.04/rccl/include",
"/public/home/zhangqha/dtk-21.04/rocblas/include",
"/public/home/zhangqha/dtk-21.04/rocfft/include",
"/public/home/zhangqha/dtk-21.04/rocprim/include",
"/public/home/zhangqha/dtk-21.04/rocrand/include",
"/public/home/zhangqha/dtk-21.04/rocsparse/include",
"/public/home/zhangqha/dtk-21.04/rocthrust/include",
"/public/home/zhangqha/for_baidu/Paddle-develop",
"/public/home/zhangqha/for_baidu/Paddle-develop/build",
"/public/home/zhangqha/for_baidu/Paddle-develop/build",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/paddle/fluid/platform",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/dlpack/src/extern_dlpack/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/eigen3/src/extern_eigen3",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/brpc/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gflags/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/glog/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gloo/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gtest/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/leveldb/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/libmct/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/mklml/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/mklml/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/protobuf/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/rocksdb/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/snappy/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/utf8proc/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/warpctc/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xbyak/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xbyak/include/xbyak",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xxhash/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xxhash/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/zlib/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/pocketfft/src",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/pybind/src/extern_pybind/include",
"/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/threadpool/src/extern_threadpool",
"/public/home/zhangqha/for_baidu/Paddle-develop/paddle/fluid/framework/io",
"/public/home/zhangqha/for_baidu/Paddle-develop/paddle/fluid/platform",
"/public/home/zhangqha/for_baidu/Paddle-develop/patches/thrust",
"/usr/local/lib/python3.6/site-packages/paddle/include",
"/usr/local/lib/python3.6/site-packages/paddle/include/third_party",
"/opt/dtk-21.04/include"
],
"library_dirs": [
"/usr/local/lib/python3.6/site-packages/paddle/libs",
"/opt/dtk-21.04/lib",
"/usr/local/lib/python3.6/site-packages/paddle/fluid"
],
"runtime_library_dirs": [
"/usr/local/lib/python3.6/site-packages/paddle/libs",
"/opt/dtk-21.04/lib",
"/usr/local/lib/python3.6/site-packages/paddle/fluid"
],
"sources": [
"./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cc",
"./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cu",
"./lr_op/lr_op_cuda.cc",
"./lr_op/lr_op_cuda.cu",
"./acc_merge/acc_merge.cc",
"./acc_merge/acc_merge.cu"
],
"undef_macros": []
}
Metadata-Version: 1.0
Name: custom-setup-ops
Version: 0.0.0
Summary: UNKNOWN
Home-page: UNKNOWN
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
setup.py
./acc_merge/acc_merge.cc
./acc_merge/acc_merge.cu
./lr_op/lr_op_cuda.cc
./lr_op/lr_op_cuda.cu
./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cc
./sort_bert_inputs_across_devices/sort_bert_inputs_across_devices.cu
custom_setup_ops.egg-info/PKG-INFO
custom_setup_ops.egg-info/SOURCES.txt
custom_setup_ops.egg-info/dependency_links.txt
custom_setup_ops.egg-info/not-zip-safe
custom_setup_ops.egg-info/top_level.txt