Commit 581b8d15 authored by liangjing

version 1
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#define CHECK_GPU_INPUT(__x) \
do { \
if (__x.place().GetType() != phi::AllocationType::GPU) { \
PD_THROW(#__x " must be GPU Tensor."); \
} \
} while (0)
//#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
std::vector<paddle::Tensor> GPUSortBERTInputsAcrossDevices(
const paddle::Tensor &input_ids,
const paddle::Tensor &segment_ids,
const paddle::Tensor &input_mask,
const paddle::Tensor &masked_lm_labels,
const paddle::Tensor &next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices);
#endif
static std::vector<paddle::Tensor> SortBERTInputsAcrossDevices(
const paddle::Tensor &input_ids,
const paddle::Tensor &segment_ids,
const paddle::Tensor &input_mask,
const paddle::Tensor &masked_lm_labels,
const paddle::Tensor &next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices) {
//#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
CHECK_GPU_INPUT(input_ids);
CHECK_GPU_INPUT(segment_ids);
CHECK_GPU_INPUT(input_mask);
CHECK_GPU_INPUT(masked_lm_labels);
CHECK_GPU_INPUT(next_sentence_labels);
return GPUSortBERTInputsAcrossDevices(input_ids,
segment_ids,
input_mask,
masked_lm_labels,
next_sentence_labels,
max_batch_size,
ring_id,
device_id,
num_devices);
#else
  PD_THROW("sort_bert_inputs_across_devices does not support CPU yet.");
#endif
}
static std::vector<std::vector<int64_t>> SortBERTInputsAcrossDevicesInferShape(
const std::vector<int64_t> &input_ids_shape,
const std::vector<int64_t> &segment_ids_shape,
const std::vector<int64_t> &input_mask_shape,
const std::vector<int64_t> &masked_lm_labels_shape,
const std::vector<int64_t> &next_sentence_labels_shape,
const int &max_batch_size,
const int &ring_id,
const int &device_id,
const int &num_devices) {
return {input_ids_shape,
segment_ids_shape,
input_mask_shape,
masked_lm_labels_shape,
next_sentence_labels_shape};
}
static std::vector<paddle::DataType> SortBERTInputsAcrossDevicesInferDType(
paddle::DataType input_ids_dtype,
paddle::DataType segment_ids_dtype,
paddle::DataType input_mask_dtype,
paddle::DataType masked_lm_labels_dtype,
paddle::DataType next_sentence_labels_dtype) {
return {input_ids_dtype,
segment_ids_dtype,
input_mask_dtype,
masked_lm_labels_dtype,
next_sentence_labels_dtype};
}
PD_BUILD_OP(sort_bert_inputs_across_devices)
.Inputs({"InputIds",
"SegmentIds",
"InputMask",
"MaskedLMLabels",
"NextSentenceLabels"})
.Outputs({"InputIdsOut",
"SegmentIdsOut",
"InputMaskOut",
"MaskedLMLabelsOut",
"NextSentenceLabelsOut"})
.Attrs({"max_batch_size: int",
"ring_id: int",
"device_id: int",
"num_devices: int"})
.SetKernelFn(PD_KERNEL(SortBERTInputsAcrossDevices))
.SetInferShapeFn(PD_INFER_SHAPE(SortBERTInputsAcrossDevicesInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(SortBERTInputsAcrossDevicesInferDType));
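// After the custom op library is built, Python loads this op as
// `from custom_setup_ops import sort_bert_inputs_across_devices` (see the
// accompanying test script) and calls it with the five input tensors plus the
// max_batch_size, ring_id, device_id and num_devices attributes; it returns
// the five reordered tensors.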
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstdint>
#include <iostream>
#include <sstream>
#include "glog/logging.h"
#include "paddle/extension.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/for_range.h"
namespace framework = paddle::framework;
namespace operators = paddle::operators;
namespace platform = paddle::platform;
namespace memory = paddle::memory;
namespace kps = paddle::operators::kernel_primitives;
template <typename T>
struct NCCLDataTypeTrait;
#define DEFINE_NCCL_DTYPE_TRAIT(__cpp_type, __nccl_dtype) \
template <> \
struct NCCLDataTypeTrait<__cpp_type> { \
static constexpr ncclDataType_t DataType = __nccl_dtype; \
}
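// NOTE: NCCL/RCCL has no 16-bit integer type, so int16_t reuses ncclFloat16
// below. This is safe here because ncclAllGather only moves raw bytes and
// both types are 2 bytes wide.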
DEFINE_NCCL_DTYPE_TRAIT(int16_t, ncclFloat16);
DEFINE_NCCL_DTYPE_TRAIT(int32_t, ncclInt32);
DEFINE_NCCL_DTYPE_TRAIT(int64_t, ncclInt64);
template <typename T>
static std::string GPUTensorToString(const T *ptr, size_t numel) {
platform::CUDAPlace place(platform::GetCurrentDeviceId());
auto &dev_ctx = *platform::DeviceContextPool::Instance().GetByPlace(place);
auto stream = dev_ctx.stream();
std::vector<T> cpu_data(numel);
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
cpu_data.data(), ptr, sizeof(T) * numel, hipMemcpyDeviceToHost, stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
std::stringstream ss;
ss << "[";
for (decltype(numel) i = 0; i < numel; ++i) {
if (i > 0) ss << ",";
ss << cpu_data[i];
}
ss << "]";
return ss.str();
}
template <typename T>
static std::string GPUTensorToString(const paddle::Tensor &t) {
return GPUTensorToString<T>(t.data<T>(), t.numel());
}
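// Pads tensor `x` along the batch dimension from its current batch size up to
// max_batch_size, filling the extra rows with zeros.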
template <typename T>
static paddle::Tensor PadTensor(const paddle::Tensor &x,
int max_batch_size,
hipStream_t stream) {
#define CALL_PAD_ZERO_DATA \
do { \
T *y_data = y.mutable_data<T>(x.place()); \
const T *x_data = x.data<T>(); \
int n = batch_size * seq_len; \
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( \
y_data, x_data, n * sizeof(T), hipMemcpyDeviceToDevice, stream)); \
PADDLE_ENFORCE_GPU_SUCCESS( \
hipMemsetAsync(y_data + n, \
0, \
(max_batch_size - batch_size) * seq_len * sizeof(T), \
stream)); \
return y; \
} while (0)
const auto &x_dim = x.dims();
int batch_size = x_dim[0];
if (x_dim.size() == 2) {
int seq_len = x_dim[1];
paddle::Tensor y(x.place(), {max_batch_size, seq_len});
CALL_PAD_ZERO_DATA;
} else {
PADDLE_ENFORCE_EQ(x_dim.size(), 1);
int seq_len = 1;
paddle::Tensor y(x.place(), {max_batch_size});
CALL_PAD_ZERO_DATA;
}
}
template <typename T>
struct RetrieveAllGatheredInputMaskFunctor {
RetrieveAllGatheredInputMaskFunctor(
const T *x, T *y, int fused_numel, int each_mask_numel, int mask_offset)
: x_(x),
y_(y),
fused_numel_(fused_numel),
each_mask_numel_(each_mask_numel),
mask_offset_(mask_offset) {}
HOSTDEVICE void operator()(int idx) const {
int rank = idx / each_mask_numel_;
int offset = mask_offset_ + idx % each_mask_numel_;
int x_idx = rank * fused_numel_ + offset;
y_[idx] = x_[x_idx];
}
private:
const T *x_;
T *y_;
int fused_numel_;
int each_mask_numel_;
int mask_offset_;
};
template <typename T>
struct IotaFunctor {
explicit IotaFunctor(T *x) : x_(x) {}
HOSTDEVICE void operator()(int idx) const { x_[idx] = static_cast<T>(idx); }
private:
T *x_;
};
template <typename T, typename IndexT>
struct IsNonZeroFunctor {
HOSTDEVICE IndexT operator()(T x) const {
return static_cast<IndexT>(x != 0);
}
};
template <typename T, typename IndexT>
struct ReorderBERTInputTensorsFunctor {
public:
ReorderBERTInputTensorsFunctor(const T *fused_inputs,
const IndexT *indices,
int device_id,
int num_devices,
int max_batch_size,
int seq_len,
T *input_ids_out,
T *segment_ids_out,
T *input_mask_out,
T *masked_lm_labels_out,
T *next_sentence_labels_out)
: fused_inputs_(fused_inputs),
indices_(indices),
device_id_(device_id),
num_devices_(num_devices),
max_batch_size_(max_batch_size),
seq_len_(seq_len),
input_ids_out_(input_ids_out),
segment_ids_out_(segment_ids_out),
input_mask_out_(input_mask_out),
masked_lm_labels_out_(masked_lm_labels_out),
next_sentence_labels_out_(next_sentence_labels_out) {}
// idx is in range [0, new_bs * seq_len)
HOSTDEVICE void operator()(int idx) const {
int out_idx_i = idx / seq_len_;
int seq_len_idx = idx % seq_len_;
auto index_per_device = indices_[device_id_ + out_idx_i * num_devices_];
auto gpu_idx = index_per_device / max_batch_size_;
auto bs_idx = index_per_device % max_batch_size_;
// [gpu_idx, bs_idx, seq_len_idx]
int device_stride = 4 * max_batch_size_ * seq_len_ + max_batch_size_;
int tensor_offset = max_batch_size_ * seq_len_;
int device_offset = gpu_idx * device_stride;
int offset = device_offset + bs_idx * seq_len_ + seq_len_idx;
input_ids_out_[idx] = fused_inputs_[offset];
segment_ids_out_[idx] = fused_inputs_[offset + tensor_offset];
input_mask_out_[idx] = fused_inputs_[offset + 2 * tensor_offset];
masked_lm_labels_out_[idx] = fused_inputs_[offset + 3 * tensor_offset];
if (seq_len_idx == 0) {
next_sentence_labels_out_[out_idx_i] =
fused_inputs_[device_offset + 4 * max_batch_size_ * seq_len_ +
bs_idx];
}
}
private:
const T *fused_inputs_;
const IndexT *indices_;
int device_id_;
int num_devices_;
int max_batch_size_;
int seq_len_;
T *input_ids_out_;
T *segment_ids_out_;
T *input_mask_out_;
T *masked_lm_labels_out_;
T *next_sentence_labels_out_;
};
/**
* input_ids: shape: [bs, seq_len], range: [0, vocab_size)
* segment_ids: shape: [bs, seq_len], range: {0, 1}
* input_mask: shape: [bs, seq_len], range: {0, 1}
* masked_lm_labels: shape: [bs, seq_len], range: [0, vocab_size)
 * next_sentence_labels: shape: [bs] or [bs, 1], range: {0, 1}
*/
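// Per device, the five tensors are packed into one contiguous buffer of
// 4 * max_batch_size * seq_len + max_batch_size elements, in the order
// input_ids | segment_ids | input_mask | masked_lm_labels |
// next_sentence_labels; this is the layout that ReorderBERTInputTensorsFunctor
// (see `device_stride` above) indexes into after the allgather.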
template <typename T>
std::vector<paddle::Tensor> GPUSortBERTInputsAcrossDevicesWithDType(
paddle::Tensor input_ids,
paddle::Tensor segment_ids,
paddle::Tensor input_mask,
paddle::Tensor masked_lm_labels,
paddle::Tensor next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices) {
const auto &dim = input_ids.dims();
int batch_size = dim[0];
int seq_len = dim[1];
PADDLE_ENFORCE_EQ(dim.size(), 2);
PADDLE_ENFORCE_LE(batch_size, max_batch_size);
PADDLE_ENFORCE_EQ(dim, segment_ids.dims());
PADDLE_ENFORCE_EQ(dim, input_mask.dims());
PADDLE_ENFORCE_EQ(dim, masked_lm_labels.dims());
const auto &nsl_dim = next_sentence_labels.dims();
if (nsl_dim.size() == 2) {
PADDLE_ENFORCE_EQ(nsl_dim[0], batch_size);
PADDLE_ENFORCE_EQ(nsl_dim[1], 1);
} else if (nsl_dim.size() == 1) {
PADDLE_ENFORCE_EQ(nsl_dim[0], batch_size);
} else {
PADDLE_THROW("invalid next_sentence_labels rank, should be 1 or 2.");
}
bool need_pad = (batch_size < max_batch_size);
// NOTE: device_id may be different from platform::GetCurrentDeviceId()!
platform::CUDAPlace place(platform::GetCurrentDeviceId());
auto &dev_ctx = *platform::DeviceContextPool::Instance().GetByPlace(place);
auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place)->comm();
auto stream = dev_ctx.stream();
// Step 1: pad to max_batch_size
if (need_pad) {
input_ids = PadTensor<T>(input_ids, max_batch_size, stream);
segment_ids = PadTensor<T>(segment_ids, max_batch_size, stream);
input_mask = PadTensor<T>(input_mask, max_batch_size, stream);
masked_lm_labels = PadTensor<T>(masked_lm_labels, max_batch_size, stream);
next_sentence_labels =
PadTensor<T>(next_sentence_labels, max_batch_size, stream);
}
VLOG(10) << "input_ids = " << GPUTensorToString<T>(input_ids);
VLOG(10) << "segment_ids = " << GPUTensorToString<T>(segment_ids);
VLOG(10) << "input_mask = " << GPUTensorToString<T>(input_mask);
VLOG(10) << "masked_lm_labels = " << GPUTensorToString<T>(masked_lm_labels);
VLOG(10) << "next_sentence_labels = "
<< GPUTensorToString<T>(next_sentence_labels);
  // Step 2: fuse into a contiguous buffer
int n = max_batch_size * seq_len;
int numel = 4 * n + max_batch_size;
auto buffer = memory::Alloc(place, numel * sizeof(T));
T *buf_ptr = reinterpret_cast<T *>(buffer->ptr());
const T *input_ids_ptr =
static_cast<const paddle::Tensor &>(input_ids).data<T>();
const T *segment_ids_ptr =
static_cast<const paddle::Tensor &>(segment_ids).data<T>();
const T *input_mask_ptr =
static_cast<const paddle::Tensor &>(input_mask).data<T>();
const T *masked_lm_labels_ptr =
static_cast<const paddle::Tensor &>(masked_lm_labels).data<T>();
const T *next_sentence_labels_ptr =
static_cast<const paddle::Tensor &>(next_sentence_labels).data<T>();
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
buf_ptr, input_ids_ptr, n * sizeof(T), hipMemcpyDeviceToDevice, stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + n,
segment_ids_ptr,
n * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + 2 * n,
input_mask_ptr,
n * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + 3 * n,
masked_lm_labels_ptr,
n * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(buf_ptr + 4 * n,
next_sentence_labels_ptr,
max_batch_size * sizeof(T),
hipMemcpyDeviceToDevice,
stream));
VLOG(10) << "fused input = " << GPUTensorToString<T>(buf_ptr, numel);
// Step 3: allgather
auto allgather_buffer = memory::Alloc(place, numel * num_devices * sizeof(T));
T *allgather_buf_ptr = reinterpret_cast<T *>(allgather_buffer->ptr());
auto nccl_dtype = NCCLDataTypeTrait<T>::DataType;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
buf_ptr, allgather_buf_ptr, numel, nccl_dtype, comm, stream));
buffer = nullptr;
VLOG(10) << "allgather fused input = "
<< GPUTensorToString<T>(allgather_buf_ptr, numel * num_devices);
// Step 4: sort by seq_len
framework::Tensor allgather_input_mask;
int gbs = num_devices * max_batch_size;
allgather_input_mask.Resize({gbs, seq_len});
T *allgather_input_mask_ptr = allgather_input_mask.mutable_data<T>(place);
platform::ForRange<platform::CUDADeviceContext> retrieve_mask_for_range(
dev_ctx, num_devices * n);
int input_mask_offset = 2 * n;
retrieve_mask_for_range(
RetrieveAllGatheredInputMaskFunctor<T>(allgather_buf_ptr,
allgather_input_mask_ptr,
numel,
n,
input_mask_offset));
VLOG(10) << "allgather mask = "
<< GPUTensorToString<T>(allgather_input_mask_ptr, gbs * seq_len);
framework::Tensor allgather_seq_len;
allgather_seq_len.Resize({gbs});
allgather_seq_len.mutable_data<T>(place);
operators::TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dev_ctx,
allgather_input_mask,
&allgather_seq_len,
kps::IdentityFunctor<T>(),
{1},
stream);
VLOG(10) << "allgather seq_len = "
<< GPUTensorToString<T>(allgather_seq_len.data<T>(), gbs);
using IndexT = int;
auto indices = memory::Alloc(place, gbs * sizeof(IndexT));
auto *indices_ptr = reinterpret_cast<IndexT *>(indices->ptr());
  platform::ForRange<platform::CUDADeviceContext> iota_for_range(dev_ctx, gbs);
  iota_for_range(IotaFunctor<IndexT>(indices_ptr));
auto sorted_indices = memory::Alloc(place, gbs * sizeof(IndexT));
auto *sorted_indices_ptr = reinterpret_cast<IndexT *>(sorted_indices->ptr());
auto *allgather_seq_len_ptr = allgather_seq_len.data<T>();
auto sorted_allgather_seq_len = memory::Alloc(place, gbs * sizeof(T));
auto *sorted_allgather_seq_len_ptr =
reinterpret_cast<T *>(sorted_allgather_seq_len->ptr());
memory::AllocationPtr tmp_storage;
void *tmp_storage_ptr;
size_t tmp_storage_size = 0;
for (int i = 0; i < 2; ++i) {
if (tmp_storage_size > 0 && tmp_storage == nullptr) {
tmp_storage = memory::Alloc(place, tmp_storage_size);
}
tmp_storage_ptr = tmp_storage ? tmp_storage->ptr() : nullptr;
PADDLE_ENFORCE_GPU_SUCCESS(
hipcub::DeviceRadixSort::SortPairsDescending(tmp_storage_ptr,
tmp_storage_size,
allgather_seq_len_ptr,
sorted_allgather_seq_len_ptr,
indices_ptr,
sorted_indices_ptr,
gbs,
0,
sizeof(T) * 8,
stream));
}
VLOG(10) << "allgather sorted seq_len = "
<< GPUTensorToString<T>(sorted_allgather_seq_len_ptr, gbs);
VLOG(10) << "allgather sorted indices = "
<< GPUTensorToString<IndexT>(sorted_indices_ptr, gbs);
// Step 5: reorder inputs
memory::AllocationPtr ntokens_alloc;
IndexT ntokens;
if (need_pad) {
    // count the number of valid (non-padded) samples across all devices
ntokens_alloc = memory::Alloc(place, sizeof(IndexT));
auto *ntokens_ptr = reinterpret_cast<IndexT *>(ntokens_alloc->ptr());
using CubIterator =
hipcub::TransformInputIterator<IndexT, IsNonZeroFunctor<T, IndexT>, T *>;
CubIterator input_iter(sorted_allgather_seq_len_ptr,
IsNonZeroFunctor<T, IndexT>());
tmp_storage_size = 0;
for (int i = 0; i < 2; ++i) {
if (i > 0 && tmp_storage_size > 0 &&
(tmp_storage == nullptr || tmp_storage->size() < tmp_storage_size)) {
tmp_storage = memory::Alloc(place, tmp_storage_size);
}
tmp_storage_ptr = tmp_storage ? tmp_storage->ptr() : nullptr;
PADDLE_ENFORCE_GPU_SUCCESS(
hipcub::DeviceReduce::Reduce(tmp_storage_ptr,
tmp_storage_size,
input_iter,
ntokens_ptr,
gbs,
hipcub::Sum(),
static_cast<IndexT>(0),
stream));
VLOG(10) << "ntokens_ptr = " << GPUTensorToString<IndexT>(ntokens_ptr, 1);
}
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
&ntokens, ntokens_ptr, sizeof(IndexT), hipMemcpyDeviceToHost, stream));
    // NOTE: This synchronization may be redundant: a D2H copy to pageable
    // (non-pinned) host memory already blocks until the copy completes.
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
} else {
ntokens = gbs;
}
VLOG(10) << "ntokens = " << ntokens;
  // The indices assigned to GPU `device_id` are:
  //   indices[device_id], indices[device_id + num_devices], ...,
  //   indices[device_id + (new_bs - 1) * num_devices]
  // Therefore device_id + (new_bs - 1) * num_devices < ntokens,
  // i.e., new_bs = ceil((ntokens - device_id) / num_devices).
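  // Example: with num_devices = 8 and ntokens = 441 valid samples, device 0
  // gets ceil((441 - 0) / 8) = 56 rows while device 7 gets
  // ceil((441 - 7) / 8) = 55, so all 441 samples are covered exactly once.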
int new_bs = (ntokens - device_id) / num_devices;
if ((ntokens - device_id) % num_devices > 0) {
++new_bs;
}
VLOG(10) << "New batch size is: " << new_bs
<< " , original batch size is: " << batch_size;
std::vector<int64_t> out_shape = {new_bs, seq_len};
std::vector<int64_t> nsl_out_shape = next_sentence_labels.shape();
nsl_out_shape[0] = new_bs;
paddle::Tensor new_input_ids(input_ids.place(), out_shape);
paddle::Tensor new_segment_ids(segment_ids.place(), out_shape);
paddle::Tensor new_input_mask(input_mask.place(), out_shape);
paddle::Tensor new_masked_lm_labels(masked_lm_labels.place(), out_shape);
paddle::Tensor new_next_sentence_labels(next_sentence_labels.place(),
nsl_out_shape);
VLOG(10) << "starts to reorder";
platform::ForRange<platform::CUDADeviceContext> reorder_for_range(
dev_ctx, new_bs * seq_len);
ReorderBERTInputTensorsFunctor<T, IndexT> reorder_functor(
allgather_buf_ptr,
sorted_indices_ptr,
device_id,
num_devices,
max_batch_size,
seq_len,
new_input_ids.mutable_data<T>(input_ids.place()),
new_segment_ids.mutable_data<T>(segment_ids.place()),
new_input_mask.mutable_data<T>(input_mask.place()),
new_masked_lm_labels.mutable_data<T>(masked_lm_labels.place()),
new_next_sentence_labels.mutable_data<T>(next_sentence_labels.place()));
reorder_for_range(reorder_functor);
VLOG(10) << "ends to reorder";
return {new_input_ids,
new_segment_ids,
new_input_mask,
new_masked_lm_labels,
new_next_sentence_labels};
}
std::vector<paddle::Tensor> GPUSortBERTInputsAcrossDevices(
const paddle::Tensor &input_ids,
const paddle::Tensor &segment_ids,
const paddle::Tensor &input_mask,
const paddle::Tensor &masked_lm_labels,
const paddle::Tensor &next_sentence_labels,
int max_batch_size,
int ring_id,
int device_id,
int num_devices) {
PADDLE_ENFORCE_GT(num_devices, 1);
auto dtype = input_ids.dtype();
#define CALL_DTYPE_FUNC(__dtype, __cpp_type) \
do { \
if (dtype == paddle::DataType::__dtype) { \
return GPUSortBERTInputsAcrossDevicesWithDType<__cpp_type>( \
input_ids, \
segment_ids, \
input_mask, \
masked_lm_labels, \
next_sentence_labels, \
max_batch_size, \
ring_id, \
device_id, \
num_devices); \
} \
} while (0)
CALL_DTYPE_FUNC(INT16, int16_t);
CALL_DTYPE_FUNC(INT32, int32_t);
CALL_DTYPE_FUNC(INT64, int64_t);
PD_THROW("Unsupported data type: %d", static_cast<int>(dtype));
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import paddle
import paddle.distributed.fleet as fleet
from paddle.fluid.layers.collective import _c_broadcast as broadcast
from custom_setup_ops import sort_bert_inputs_across_devices
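# Each rank builds a random, unsorted per-device batch, computes the expected
# globally sorted (by sequence length, descending) per-device slice with numpy,
# and checks that sort_bert_inputs_across_devices returns the same tensors.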
def run_test_case(batch_size, max_batch_size=None):
if max_batch_size is None:
max_batch_size = batch_size
max_seq_len = 512
vocab_size = 30528
dtype = 'int64'
ring_id = 0
rank = paddle.distributed.get_rank()
world_size = paddle.distributed.get_world_size()
device_id = rank
num_devices = world_size
global_batch_size = batch_size * num_devices
seq_lens = np.random.randint(
low=100, high=max_seq_len + 1, size=[num_devices * batch_size])
input_ids = []
segment_ids = []
input_mask = []
masked_lm_labels = []
next_sentence_labels = []
for seq_len in seq_lens:
input_id = np.random.randint(low=1, high=vocab_size, size=[max_seq_len])
input_id[seq_len:] = 0
input_ids.append(input_id)
sentence_1_len = np.random.randint(
low=10, high=seq_len - 10, size=[1])[0]
segment_id = np.zeros([max_seq_len])
segment_id[sentence_1_len:seq_len] = 1
segment_ids.append(segment_id)
mask = np.ones([max_seq_len])
mask[seq_len:] = 0
input_mask.append(mask)
masked_lm_label = np.random.randint(
low=0, high=vocab_size, size=[max_seq_len])
masked_lm_labels.append(masked_lm_label)
next_sentence_label = np.random.randint(low=0, high=2, size=[1])[0]
next_sentence_labels.append(next_sentence_label)
shape = [num_devices, batch_size, max_seq_len]
input_ids = np.reshape(np.array(input_ids, dtype=dtype), shape)
segment_ids = np.reshape(np.array(segment_ids, dtype=dtype), shape)
input_mask = np.reshape(np.array(input_mask, dtype=dtype), shape)
masked_lm_labels = np.reshape(
np.array(
masked_lm_labels, dtype=dtype), shape)
next_sentence_labels = np.reshape(
np.array(
next_sentence_labels, dtype=dtype), [num_devices, batch_size])
seq_lens = np.reshape(np.sum(input_mask, axis=2), [-1])
sorted_indices = np.argsort(-seq_lens, kind='mergesort')
sorted_indices = np.reshape(sorted_indices,
[batch_size, num_devices])[:, rank]
unsorted_indices = np.array(
range(rank * batch_size, (rank + 1) * batch_size),
dtype=sorted_indices.dtype)
def reorder_by_indices(indices):
shape = [num_devices * batch_size, max_seq_len]
input_ids_ret = np.reshape(input_ids, shape)[indices, :]
segment_ids_ret = np.reshape(segment_ids, shape)[indices, :]
input_mask_ret = np.reshape(input_mask, shape)[indices, :]
masked_lm_labels_ret = np.reshape(masked_lm_labels, shape)[indices, :]
next_sentence_labels_ret = np.reshape(next_sentence_labels,
shape[:1])[indices]
return input_ids_ret, segment_ids_ret, input_mask_ret, masked_lm_labels_ret, next_sentence_labels_ret
input_ids_sorted, segment_ids_sorted, input_mask_sorted, masked_lm_labels_sorted, next_sentence_labels_sorted = reorder_by_indices(
sorted_indices)
input_ids_unsorted, segment_ids_unsorted, input_mask_unsorted, masked_lm_labels_unsorted, next_sentence_labels_unsorted = reorder_by_indices(
unsorted_indices)
input_ids_sorted_actual, segment_ids_sorted_actual, input_mask_sorted_actual, masked_lm_labels_sorted_actual, next_sentence_labels_sorted_actual = sort_bert_inputs_across_devices(
paddle.to_tensor(input_ids_unsorted),
paddle.to_tensor(segment_ids_unsorted),
paddle.to_tensor(input_mask_unsorted),
paddle.to_tensor(masked_lm_labels_unsorted),
paddle.to_tensor(next_sentence_labels_unsorted), max_batch_size,
ring_id, rank, world_size)
def assert_equal(x, y):
assert np.array_equal(np.array(x), np.array(y))
assert_equal(input_ids_sorted, input_ids_sorted_actual)
assert_equal(segment_ids_sorted, segment_ids_sorted_actual)
assert_equal(input_mask_sorted, input_mask_sorted_actual)
assert_equal(masked_lm_labels_sorted, masked_lm_labels_sorted_actual)
assert_equal(next_sentence_labels_sorted,
next_sentence_labels_sorted_actual)
print('Test Passed')
def gen_seed_if_not_exists():
seed = os.environ.get('SEED')
if seed:
return int(seed)
with paddle.no_grad():
seed = np.random.randint(low=0, high=10000, size=[1])
seed = paddle.to_tensor(seed)
seed = broadcast(seed, use_calc_stream=True)
seed = seed.numpy()[0]
os.environ['SEED'] = str(seed)
return seed
def run_main():
fleet.init(is_collective=True)
seed = gen_seed_if_not_exists()
np.random.seed(seed)
run_test_case(56)
run_test_case(47, 56)
if __name__ == "__main__":
run_main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
def get_gpu_list():
gpus = os.environ.get("CUDA_VISIBLE_DEVICES")
if gpus is None:
output = subprocess.check_output(
["sh", "-c", "nvidia-smi --list-gpus | wc -l"])
gpu_num = int(output.strip())
gpus = list(range(gpu_num))
else:
gpus = [int(g.strip()) for g in gpus.split(",") if g.strip()]
return gpus
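# Map this process's MPI world rank to a local GPU index; the modulo 4 below
# assumes 4 GPUs (and 4 MPI ranks) per node.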
def get_local_rank():
world_rank = os.environ.get("OMPI_COMM_WORLD_RANK")
gpus = get_gpu_list()
if world_rank is None:
local_rank = gpus[0]
else:
local_rank = gpus[int(world_rank) % 4]
return local_rank
if __name__ == "__main__":
local_rank = get_local_rank()
print(local_rank)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from mpi4py import MPI
import numpy as np
import time
import paddle
from pybind.functions import process_allgathered_inputs as process_bert_inputs
from pybind.functions import process_eval_inputs as process_bert_eval_inputs
import h5py
import random
global_comm = MPI.COMM_WORLD
global_rank = global_comm.rank
global_world_size = global_comm.size
assert global_world_size % 2 == 0
def create_group_comm(ranks):
ranks = list(ranks)
new_group = global_comm.group.Incl(ranks)
new_comm = global_comm.Create_group(new_group)
return new_comm
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(comm, seeds, root=0):
seeds = np.array(seeds).astype(np.int64)
comm.Bcast(seeds, root=root)
return seeds.tolist()
def select_dataset_file_for_each_worker(files, f_start_id, worker_num,
worker_index):
"""
    Splitting the training files among workers according to the worker index.
"""
num_files = len(files)
if worker_num > num_files:
remainder = worker_num % num_files
data_file = files[(
f_start_id * worker_num + worker_index + remainder * f_start_id) %
num_files]
else:
data_file = files[(f_start_id * worker_num + worker_index) % num_files]
return data_file
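# Read one HDF5 shard and convert the sparse (masked_lm_positions, masked_lm_ids)
# pair into a dense masked_lm_labels array with the same shape as input_ids,
# compute each sample's valid length from input_mask, and shuffle the samples.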
def read_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.array(f[key], dtype=dtype) for key in keys]
n = outputs[0].shape[0]
masked_lm_labels = np.zeros(outputs[0].shape, dtype=dtype)
lengths = np.zeros(n, dtype=dtype)
for i in range(n):
masked_lm_positions = outputs[3][i]
masked_lm_ids = outputs[4][i]
length = np.count_nonzero(masked_lm_positions)
masked_lm_labels[i][
masked_lm_positions[:length]] = masked_lm_ids[:length]
lengths[i] = np.count_nonzero(outputs[1][i])
outputs = [
outputs[0], outputs[2], outputs[1], masked_lm_labels, outputs[-1],
lengths
]
idx = np.random.choice(np.arange(n), n, replace=False)
for i in range(len(outputs)):
outputs[i] = outputs[i][idx]
return outputs
def read_eval_hdf5_file(input_file, dtype=np.int16):
keys = [
'input_ids',
'input_mask',
'segment_ids',
'masked_lm_positions',
'masked_lm_ids',
'next_sentence_labels',
]
if not os.path.exists(input_file):
return None
with h5py.File(input_file, 'r') as f:
outputs = [np.asarray(f[key][:]) for key in keys]
nsamples = outputs[0].shape[0]
all_data = []
for index in range(nsamples):
[
input_ids, input_mask, segment_ids, masked_lm_positions,
masked_lm_ids, next_sentence_labels
] = [
input[index].astype(dtype)
if indice < 5 else np.asarray(input[index].astype(dtype))
for indice, input in enumerate(outputs)
]
length = np.count_nonzero(masked_lm_positions)
masked_lm_positions = masked_lm_positions[:length]
masked_lm_ids = masked_lm_ids[:length]
masked_lm_labels = np.zeros(input_ids.shape, dtype=dtype)
masked_lm_labels[masked_lm_positions] = masked_lm_ids
seq_len = np.asarray(np.count_nonzero(input_mask))
data = [
input_ids,
segment_ids,
input_mask,
masked_lm_labels,
next_sentence_labels,
seq_len,
]
# (2050, ), i.e., 512 * 4 + 1 + 1
one_sample_data = np.concatenate([d.flatten() for d in data])
all_data.extend(one_sample_data)
# (2050000, ) -> (10000, 2050)
return np.asarray(all_data).reshape((nsamples, -1))
class WorkerInitObj(object):
"Construct the object with different seed, and the Dataloader will generate the data "
"with different seed in each worker."
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
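# Half of the MPI ranks act as trainers and the other half as data readers:
# rank r < world_size / 2 is trainer r, and rank r + world_size / 2 is its
# paired reader. The communicators created below are used for the
# trainer <-> reader handshake.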
class Context:
def __init__(self):
half_size = int(global_world_size / 2)
self.trainer_id = global_rank % half_size
self.trainer_num = half_size
self.is_trainer = (global_rank < half_size)
self.reader_id = self.trainer_id
self.reader_num = self.trainer_num
self.is_reader = not self.is_trainer
self.trainer_comm = create_group_comm(range(0, half_size))
self.reader_comm = create_group_comm(
range(half_size, global_world_size))
self.trainer_reader_comm = create_group_comm(
[self.trainer_id, self.trainer_id + half_size])
self.global_comm = global_comm
def init_args(self, args, dtype=np.int16):
self.args = args
self.files = [
os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
if os.path.isfile(os.path.join(args.input_dir, f)) and "part" in f
]
self.files.sort()
self.fid_buf = np.array([1], dtype=np.int64)
with h5py.File(self.files[0], 'r') as f:
self.num_samples = np.array(f["next_sentence_labels"][:]).size
self.batch_size = args.train_batch_size
self.max_seq_length = args.max_seq_length
self.worker_seeds, self.shuffling_seeds = self._setup_seeds(
args.seed, args.num_epochs_to_generate_seeds_for)
self.epoch_idx = 0
data_buf_size = self.num_samples * 4 * self.max_seq_length + self.num_samples * 2
self.data_buf = np.empty(
shape=[self.trainer_num * data_buf_size], dtype=dtype)
self.eval_dir = args.eval_dir
self.num_eval_examples = args.num_eval_examples
self.eval_batch_size = args.eval_batch_size
cur_seed = self.worker_seeds[self.trainer_id]
np.random.seed(cur_seed)
random.seed(cur_seed)
paddle.seed(cur_seed)
self.worker_init = WorkerInitObj(cur_seed)
self.barrier()
def shuffle_files(self):
random.Random(self.shuffling_seeds[self.epoch_idx]).shuffle(self.files)
self.epoch_idx += 1
def _setup_seeds(self, master_seed, epochs):
if master_seed is None:
master_seed = random.SystemRandom().randint(0, 2**32 - 1)
if self.trainer_id == 0:
print('Using random master seed: {}'.format(master_seed))
else:
print('Using master seed from command line: {}'.format(master_seed))
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, self.trainer_num)
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
worker_seeds = broadcast_seeds(self.global_comm, worker_seeds)
shuffling_seeds = broadcast_seeds(self.global_comm, shuffling_seeds)
return worker_seeds, shuffling_seeds
def worker_seed(self):
return self.worker_seeds[self.trainer_id]
def barrier(self):
self.global_comm.barrier()
def stop_reader(self):
if self.is_trainer:
self.read_file(-1)
def file_num(self):
return len(self.files)
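    # Trainer/reader handshake: the trainer sends the file id to its paired
    # reader, the reader loads its HDF5 shard, all readers allgather their
    # shards into data_buf, and the paired reader sends the gathered buffer back
    # to the trainer, which extracts its own batches via process_bert_inputs.
    # f_id == 0 also triggers an epoch shuffle; f_id < 0 stops the reader.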
def read_file(self, f_id=None):
if self.is_trainer:
self.fid_buf[0] = f_id
self.trainer_reader_comm.Isend(self.fid_buf, dest=1)
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return
self.trainer_reader_comm.Recv(self.data_buf, source=1)
results = process_bert_inputs(self.data_buf, self.num_samples,
self.max_seq_length, self.batch_size,
self.trainer_id, self.trainer_num)
return results
else:
self.trainer_reader_comm.Recv(self.fid_buf, 0)
f_id = self.fid_buf[0]
if f_id == 0:
self.shuffle_files()
elif f_id < 0:
return False
fname = select_dataset_file_for_each_worker(
self.files, f_id, self.trainer_num, self.trainer_id)
data = read_hdf5_file(fname, dtype=self.data_buf.dtype)
send_buf = np.concatenate([d.flatten() for d in data])
self.reader_comm.Allgather(send_buf, self.data_buf)
self.trainer_reader_comm.Send(self.data_buf, dest=0)
return True
def read_eval_file(self):
if self.is_trainer:
eval_data = []
for eval_file in sorted(os.listdir(self.eval_dir)):
eval_file_path = os.path.join(self.eval_dir, eval_file)
if os.path.isfile(eval_file_path) and 'part' in eval_file_path:
data = read_eval_hdf5_file(
eval_file_path, dtype=self.data_buf.dtype)
eval_data.extend(data)
if len(eval_data) > self.num_eval_examples:
break
chunk_size = self.num_eval_examples // self.trainer_num
rank = self.trainer_id
remainder = self.num_eval_examples % self.trainer_num
if rank < remainder:
eval_data = eval_data[(chunk_size + 1) * rank:(chunk_size + 1) *
(rank + 1)]
else:
eval_data = eval_data[chunk_size * rank + remainder:chunk_size *
(rank + 1) + remainder]
results = process_bert_eval_inputs(eval_data, self.max_seq_length,
self.eval_batch_size,
self.args.sort_eval_data)
return results
_context = Context()
def get_context():
return _context
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ps aux | grep "$1" | awk '{print $2}' | xargs kill -9
import hostlist
import os
import json
def get_list():
worker = []
nodelist = os.environ["SLURM_JOB_NODELIST"]
nodelist = hostlist.expand_hostlist(nodelist)
num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
port_number = 60001
worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
for node in worker_nodes:
for index in range(4):
worker_sockets = ":".join([node, str(port_number + index )])
worker.append(worker_sockets)
return ','.join(worker)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import subprocess
from mlperf_logging import mllog
from mlperf_logging.mllog import constants
mllogger = mllog.get_mllogger()
def log_start(*args, **kwargs):
_log_print(mllogger.start, *args, **kwargs)
def log_end(*args, **kwargs):
_log_print(mllogger.end, *args, **kwargs)
def log_event(*args, **kwargs):
_log_print(mllogger.event, *args, **kwargs)
def _log_print(logger, *args, **kwargs):
if kwargs.pop('sync', False):
barrier()
if 'stack_offset' not in kwargs:
kwargs['stack_offset'] = 3
if 'value' not in kwargs:
kwargs['value'] = None
if kwargs.pop('log_all_ranks', False):
log = True
else:
log = (get_rank() == 0)
if log:
logger(*args, **kwargs)
def mlperf_submission_log(benchmark):
num_nodes = os.environ.get('SLURM_NNODES', 1)
mllog.config(filename=os.path.join(
os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
mllogger = mllog.get_mllogger()
mllogger.logger.propagate = False
log_event(
key=constants.SUBMISSION_BENCHMARK,
value=benchmark, )
log_event(key=constants.SUBMISSION_ORG, value='NVIDIA')
log_event(key=constants.SUBMISSION_DIVISION, value='closed')
log_event(key=constants.SUBMISSION_STATUS, value='onprem')
log_event(
key=constants.SUBMISSION_PLATFORM,
value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .modeling import *
from .optimization import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import inspect
import paddle
from paddle.nn import Layer
def fn_args_to_dict(func, *args, **kwargs):
"""
Inspect function `func` and its arguments for running, and extract a
dict mapping between argument names and keys.
"""
if hasattr(inspect, 'getfullargspec'):
(spec_args, spec_varargs, spec_varkw, spec_defaults, _, _,
_) = inspect.getfullargspec(func)
else:
(spec_args, spec_varargs, spec_varkw,
spec_defaults) = inspect.getargspec(func)
# add positional argument values
init_dict = dict(zip(spec_args, args))
# add default argument values
kwargs_dict = dict(zip(spec_args[-len(spec_defaults):],
spec_defaults)) if spec_defaults else {}
kwargs_dict.update(kwargs)
init_dict.update(kwargs_dict)
return init_dict
class InitTrackerMeta(type(Layer)):
"""
    This metaclass wraps the `__init__` method of a class to add an `init_config`
    attribute to instances of that class; `init_config` is a dict that tracks the
    initial configuration. If the class has a `_wrap_init` method, it is hooked
    after `__init__` and called as `_wrap_init(self, init_fn, init_args)`.
    Since InitTrackerMeta is used as the metaclass for pretrained model classes,
    which are always `Layer` subclasses, and `type(Layer)` is not `type`, it uses
    `type(Layer)` rather than `type` as its base class to avoid metaclass
    conflicts.
"""
def __init__(cls, name, bases, attrs):
init_func = cls.__init__
        # If attrs has `__init__`, wrap it using the accessible `_wrap_init`.
        # Otherwise, no need to wrap again since the super cls has been wrapped.
# TODO: remove reduplicated tracker if using super cls `__init__`
help_func = getattr(cls, '_wrap_init',
None) if '__init__' in attrs else None
cls.__init__ = InitTrackerMeta.init_and_track_conf(init_func, help_func)
super(InitTrackerMeta, cls).__init__(name, bases, attrs)
@staticmethod
def init_and_track_conf(init_func, help_func=None):
"""
wraps `init_func` which is `__init__` method of a class to add `init_config`
attribute for instances of that class.
Args:
init_func (callable): It should be the `__init__` method of a class.
            help_func (callable, optional): If provided, it will be hooked after
                `init_func` and called as `_wrap_init(self, init_func, *init_args, **init_kwargs)`.
                Default: None.
Returns:
function: the wrapped function
"""
@functools.wraps(init_func)
def __impl__(self, *args, **kwargs):
# keep full configuration
init_func(self, *args, **kwargs)
            # registered helper via `_wrap_init`
if help_func:
help_func(self, init_func, *args, **kwargs)
self.init_config = kwargs
if args:
kwargs['init_args'] = args
kwargs['init_class'] = self.__class__.__name__
return __impl__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .collate import *
from .vocab import *
from .sampler import *
from .tokenizer import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
__all__ = ['Stack', 'Pad', 'Tuple', 'Dict']
class Stack(object):
"""
Stacks the input data samples to construct the batch. The N input samples
must have the same shape/length and will be stacked to construct a batch.
Args:
axis (int, optional): The axis in the result data along which the input
data are stacked. Default: 0.
dtype (str|numpy.dtype, optional): The value type of the output. If it
is set to None, the type of input data is used. Default: None.
"""
def __init__(self, axis=0, dtype=None):
self._axis = axis
self._dtype = dtype
def __call__(self, data):
"""
Batchifies the input data by stacking.
Args:
data (list[numpy.ndarray]): The input data samples. It is a list.
Each element is a numpy.ndarray or list.
Returns:
numpy.ndarray: Stacked batch data.
Example:
.. code-block:: python
from paddlenlp.data import Stack
a = [1, 2, 3, 4]
b = [3, 4, 5, 6]
c = [5, 6, 7, 8]
result = Stack()([a, b, c])
'''
[[1, 2, 3, 4],
[3, 4, 5, 6],
[5, 6, 7, 8]]
'''
"""
data = np.stack(
data,
axis=self._axis).astype(self._dtype) if self._dtype else np.stack(
data, axis=self._axis)
return data
class Pad(object):
"""
Pads the input data samples to the largest length at `axis`.
Args:
pad_val (float|int, optional): The padding value. Default: 0.
axis (int, optional): The axis to pad the arrays. The arrays will be
padded to the largest length at `axis`. For example, assume the
input arrays have shape (10, 8, 5), (6, 8, 5), (3, 8, 5) and the
axis is 0. Each input will be padded into (10, 8, 5) and then
stacked to form the final output, which has shape (3, 10, 8, 5).
Default: 0.
ret_length (bool|numpy.dtype, optional): If it is bool, indicate whether
to return the valid length in the output, and the data type of
returned length is int32 if True. If it is numpy.dtype, indicate the
data type of returned length. Default: None.
dtype (numpy.dtype, optional): The value type of the output. If it is
set to None, the input data type is used. Default: None.
pad_right (bool, optional): Whether the padding direction is right-side.
If True, it indicates we pad to the right side, while False indicates
we pad to the left side. Default: True.
"""
def __init__(self,
pad_val=0,
axis=0,
ret_length=None,
dtype=None,
pad_right=True):
self._pad_val = pad_val
self._axis = axis
self._ret_length = ret_length
self._dtype = dtype
self._pad_right = pad_right
def __call__(self, data):
"""
Batchifies the input data by padding. The input will be padded to the
largest dimension at `axis` and then stacked to form the final output.
In addition, the function will output the original dimensions at the
`axis` if `ret_length` is not None or False.
Args:
data (list[numpy.ndarray|list]): The input data samples. It is a
list. Each element is a numpy.ndarray or list.
Returns:
numpy.ndarray|tuple[numpy.ndarray]: If `ret_length` is False, it
is a numpy.ndarray representing the padded batch data and the
shape is (N, …). Otherwise, it is a tuple, besides the padded batch
data, the tuple also includes a numpy.ndarray representing original
length at `axis` of all input samples, which shaped `(N,)`.
Example:
.. code-block:: python
from paddlenlp.data import Pad
a = [1, 2, 3, 4]
b = [5, 6, 7]
c = [8, 9]
result = Pad(pad_val=0)([a, b, c])
'''
[[1, 2, 3, 4],
[5, 6, 7, 0],
[8, 9, 0, 0]]
'''
"""
arrs = [np.asarray(ele) for ele in data]
original_length = [ele.shape[self._axis] for ele in arrs]
max_size = max(original_length)
ret_shape = list(arrs[0].shape)
ret_shape[self._axis] = max_size
ret_shape = (len(arrs), ) + tuple(ret_shape)
ret = np.full(
shape=ret_shape,
fill_value=self._pad_val,
dtype=arrs[0].dtype if self._dtype is None else self._dtype)
for i, arr in enumerate(arrs):
if arr.shape[self._axis] == max_size:
ret[i] = arr
else:
slices = [slice(None) for _ in range(arr.ndim)]
if self._pad_right:
slices[self._axis] = slice(0, arr.shape[self._axis])
else:
slices[self._axis] = slice(max_size - arr.shape[self._axis],
max_size)
if slices[self._axis].start != slices[self._axis].stop:
slices = [slice(i, i + 1)] + slices
ret[tuple(slices)] = arr
if self._ret_length:
return ret, np.asarray(
original_length,
dtype="int32") if self._ret_length == True else np.asarray(
original_length, self._ret_length)
else:
return ret
class Tuple(object):
"""
Wraps multiple batchify functions together. The input functions will be applied
to the corresponding input fields.
Each sample should be a list or tuple containing multiple fields. The i'th
batchify function stored in Tuple will be applied on the i'th field.
For example, when data sample is (nd_data, label), you can wrap two batchify
functions using `Tuple(DataBatchify, LabelBatchify)` to batchify nd_data and
label correspondingly.
Args:
fn (callable|list[callable]|tuple[callable]): The batchify functions to
wrap. It is a callable function or a list/tuple of callable functions.
args (tuple[callable]): The additional batchify functions to wrap.
"""
def __init__(self, fn, *args):
if isinstance(fn, (list, tuple)):
assert len(args) == 0, 'Input pattern not understood. The input of Tuple can be ' \
'Tuple(A, B, C) or Tuple([A, B, C]) or Tuple((A, B, C)). ' \
'Received fn=%s, args=%s' % (str(fn), str(args))
self._fn = fn
else:
self._fn = (fn, ) + args
for i, ele_fn in enumerate(self._fn):
assert callable(
ele_fn
), 'Batchify functions must be callable! type(fn[%d]) = %s' % (
i, str(type(ele_fn)))
def __call__(self, data):
"""
Batchifies data samples by applying each function on the corresponding
data field, and each data field is produced by stacking the field data
of samples.
Args:
            data (list|tuple): The samples to batchify. Each sample in the
                list/tuple should contain `N` fields.
        Returns:
            tuple: A tuple composed of the results of all the included batchify
                functions.
Example:
.. code-block:: python
from paddlenlp.data import Stack, Pad, Tuple
data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
batchify_fn = Tuple(Pad(pad_val=0), Stack())
ids, label = batchify_fn(data)
'''
ids:
[[1, 2, 3, 4],
[5, 6, 7, 0],
[8, 9, 0, 0]]
label: [[1], [0], [1]]
'''
"""
assert len(data[0]) == len(self._fn),\
            'Each data sample should contain {} fields.'.format(len(self._fn))
ret = []
for i, ele_fn in enumerate(self._fn):
result = ele_fn([ele[i] for ele in data])
if isinstance(result, (tuple, list)):
ret.extend(result)
else:
ret.append(result)
return tuple(ret)
class Dict(object):
"""
Wraps multiple batchify functions together. The input functions will be
applied to the corresponding input fields.
Each sample should be a dict containing multiple fields. Each batchify
function with key stored in `Dict` will be applied on the field which has
the same key.
For example, when data sample is {'tokens': tokens, 'labels': labels}, you
can wrap two batchify functions using
`Dict({'tokens': DataBatchify, 'labels': LabelBatchify})` to batchify tokens
and labels correspondingly.
Args:
        fn (dict): The batchify functions to wrap. It is a dict whose values
            are callable batchify functions.
"""
def __init__(self, fn):
assert isinstance(fn, (dict)), 'Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn ' \
'Received fn=%s' % (str(fn))
self._fn = fn
for col_name, ele_fn in self._fn.items():
assert callable(
ele_fn
            ), 'Batchify functions must be callable! type(fn[%s]) = %s' % (
col_name, str(type(ele_fn)))
def __call__(self, data):
"""
Batchifies data samples by applying each function on the corresponding
data field, and each data field is produced by stacking the field data
with the same key as batchify functions of all samples.
Args:
            data (list[dict]|tuple[dict]): The samples to batchify. Each sample
                in the list/tuple is a dict with `N` key-values.
        Returns:
            tuple: A tuple composed of the results of all the included batchify
                functions.
Example:
.. code-block:: python
from paddlenlp.data import Stack, Pad, Dict
data = [
{'labels':[1], 'token_ids':[1, 2, 3, 4]},
{'labels':[0], 'token_ids':[5, 6, 7]},
{'labels':[1], 'token_ids':[8, 9]},
]
batchify_fn = Dict({'token_ids':Pad(pad_val=0), 'labels':Stack()})
ids, label = batchify_fn(data)
'''
ids:
[[1, 2, 3, 4],
[5, 6, 7, 0],
[8, 9, 0, 0]]
label: [[1], [0], [1]]
'''
"""
ret = []
for col_name, ele_fn in self._fn.items():
result = ele_fn([ele[col_name] for ele in data])
if isinstance(result, (tuple, list)):
ret.extend(result)
else:
ret.append(result)
return tuple(ret)
# Iterator for NLP Dataset
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import functools
import math
import six
import numpy as np
import paddle.distributed as dist
class SamplerHelper(object):
"""
The class is to help construct iterable sampler used for
:class:`paddle.io.DataLoader`. It wraps a dataset and uses its
:meth:`__getitem__` method. Every subclass of :class:`SamplerHelper` has
to provide an :meth:`__iter__` method, providing a way to iterate over
indices of dataset elements, and a :meth:`__len__` method that returns the
length of the returned iterators.
    The class can also be used as a batch iterator, rather than an indices
    iterator, when `iterable` yields samples instead of indices, i.e. when it is
    initialized with an iterable dataset.
.. note::
The :meth:`__len__` method isn't strictly required by
:class:`paddle.io.DataLoader`, but is expected in any calculation
involving the length of a :class:`paddle.io.DataLoader`.
Args:
dataset (Dataset): Input dataset for :class:`SamplerHelper`.
iterable (Iterable, optional): Iterator of dataset. Default: None.
"""
# chain sampler
def __init__(self, dataset, iterable=None):
self.data_source = dataset
self.iterable = iterable
        if isinstance(dataset, collections.abc.Iterable) and iterable is None:
# iterable-style datasets
self.iterable = dataset
def __iter__(self):
if self.iterable is None:
return iter(range(len(self.data_source)))
        elif isinstance(self.iterable, collections.abc.Iterable):
return iter(self.iterable)
elif callable(self.iterable):
return self.iterable()
else:
raise ValueError(
"`iterable` should be None, instance of Iterable or callable "
"producing generator.")
def __len__(self):
# Allow some samplers have different length with `len(data_source)`,
# such as batch sampler.
if hasattr(self, "_length"):
return self._length
else:
return len(self.data_source)
@property
def length(self):
"""
Returns the length.
"""
        # Since `len()` can only produce an integer, use the `length` property
        # to get None for uncertain lengths. Samplers can set `length` if needed.
try:
length = len(self)
except Exception:
length = None
return length
@length.setter
def length(self, length):
self._length = length
def apply(self, fn):
# Transformation functions would be performed. It includes
# :meth:`shuffle`, :meth:`sort`, :meth:`fit` and :meth:`shard`.
# Args:
# fn (callable): Transformation functions to be performed.
# Returns:
# SamplerHelper: A new transformed :class:`SamplerHelper` object.
rs = fn(self)
if isinstance(rs, (list, tuple)):
iterable, data_source = rs
else:
iterable, data_source = rs, self.data_source
sampler = type(self)(data_source, iterable)
return sampler
def shuffle(self, buffer_size=-1, seed=None):
"""
Shuffles the dataset according to the given buffer size and random seed.
Args:
buffer_size (int, optional): Buffer size for shuffle. If
`buffer_size < 0` or more than the length of the dataset,
`buffer_size` is the length of the dataset. Default: -1.
seed (int, optional): Seed for the random. Default: None.
Returns:
SamplerHelper: A new shuffled :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
sampler = sampler.shuffle(seed=2)
print(list(sampler)) # indices of dataset elements
# [2, 1, 0]
"""
if seed is not None:
random_generator = np.random.RandomState(seed)
else: # use the global random generator
random_generator = np.random
def _impl():
buf = []
for idx in iter(self):
buf.append(idx)
if buffer_size > 0 and len(buf) >= buffer_size:
random_generator.shuffle(buf)
for b in buf:
yield b
buf = []
if len(buf) > 0:
random_generator.shuffle(buf)
for b in buf:
yield b
return type(self)(self.data_source, _impl)
def sort(self, cmp=None, key=None, reverse=False, buffer_size=-1):
"""
Sorts the dataset according to given callable :meth:`cmp` or :meth:`key`.
Args:
cmp (callable, optional): The function of comparison. Default: None.
key (callable, optional): The function of key. Default: None.
reverse (bool, optional): Whether to reverse when sorting the data
samples. If True, it means in descending order, and False means
in ascending order. Default: False.
buffer_size (int, optional): Buffer size for sort. If
`buffer_size < 0` or `buffer_size` is more than the length
of the data, `buffer_size` will be set to the length of the data.
Default: -1.
Returns:
SamplerHelper: A new sorted :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
# Sorted in ascending order by the length of the first field
# of the sample
key = (lambda x, data_source: len(data_source[x][0]))
sampler = sampler.sort(key=key)
print(list(sampler)) # indices of dataset elements
# [2, 1, 0]
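# A minimal sketch: sort only within a buffer of 2 samples, which
# yields locally sorted buckets instead of a global sort.
sampler = SamplerHelper(dataset).sort(key=key, buffer_size=2)
print(list(sampler))
# [1, 0, 2]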
"""
if key:
key_wrapper = (lambda x: key(x, self.data_source))
elif cmp:
key_wrapper = functools.cmp_to_key(
lambda x, y: cmp(x, y, self.data_source))
else:
key_wrapper = (lambda x: len(self.data_source[x]))
def _impl():
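# Sort indices inside a bounded buffer with the wrapped key, emitting
# each full buffer in order and flushing the remainder at the end.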
data_source = self.data_source
buf = []
for idx in iter(self):
buf.append(idx)
if buffer_size > 0 and len(buf) >= buffer_size:
buf = sorted(buf, key=key_wrapper, reverse=reverse)
for b in buf:
yield b
buf = []
if len(buf) > 0:
buf = sorted(buf, key=key_wrapper, reverse=reverse)
for b in buf:
yield b
return type(self)(self.data_source, _impl)
def batch(self, batch_size, drop_last=False, batch_size_fn=None, key=None):
"""
Batches the dataset according to the given `batch_size`.
Args:
batch_size (int): The batch size.
drop_last (bool, optional): Whether to drop the last mini batch.
Default: False.
batch_size_fn (callable, optional): It accepts four arguments: the
index of the current sample in the data source, the length of the
minibatch, the size of the minibatch so far, and the data source,
and it returns the new size of the minibatch so far. The returned
value can be anything and is used as the `size_so_far` argument of
`key`. If None, the length of the mini batch is returned.
Default: None.
key (callable, optional): It accepts the size of the minibatch so far
and the length of the minibatch, and returns the value to be
compared with `batch_size`. If None, only the size of the mini batch
so far is compared with `batch_size`. Default: None.
Returns:
SamplerHelper: A new batched :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
sampler = sampler.batch(batch_size=2)
print(list(sampler)) # indices of dataset elements
# [[0, 1], [2]]
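# A minimal sketch (`token_count_fn` is a hypothetical helper): batch by
# a token budget instead of a sample count, assuming the first field of
# each sample is the token sequence.
def token_count_fn(idx, minibatch_len, size_so_far, data_source):
    return size_so_far + len(data_source[idx][0])
sampler = SamplerHelper(dataset).batch(
    batch_size=6, batch_size_fn=token_count_fn)
print(list(sampler))
# [[0], [1, 2]]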
"""
_key = lambda size_so_far, minibatch_len: size_so_far
ori_batch_size_fn = batch_size_fn
if batch_size_fn is None:
batch_size_fn = lambda new, count, sofar, data_source: count
key = _key if key is None else key
def _impl():
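# Accumulate indices until the running size (as defined by
# `batch_size_fn` and `key`) reaches `batch_size`, then emit a minibatch.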
data_source = self.data_source
minibatch, size_so_far = [], 0
for idx in iter(self):
minibatch.append(idx)
size_so_far = batch_size_fn(idx,
len(minibatch), size_so_far,
data_source)
if key(size_so_far, len(minibatch)) == batch_size:
yield minibatch
minibatch, size_so_far = [], 0
elif key(size_so_far, len(minibatch)) > batch_size:
if len(minibatch) == 1:
raise ValueError(
"Please increase the value of `batch_size`, or limit the max length of batch."
)
yield minibatch[:-1]
minibatch, size_so_far = minibatch[-1:], batch_size_fn(
idx, 1, 0, data_source)
if minibatch and not drop_last:
yield minibatch
sampler = type(self)(self.data_source, _impl)
if ori_batch_size_fn is None and self.length is not None:
sampler.length = (self.length + int(not drop_last) *
(batch_size - 1)) // batch_size
else:
sampler.length = None
return sampler
def shard(self, num_replicas=None, rank=None):
"""
Slices the dataset for multi-GPU training.
Args:
num_replicas (int, optional): The number of training processes, which
is also the number of GPU cards used in training. If None, it
will be set by the :meth:`paddle.distributed.get_world_size` method.
Default: None.
rank (int, optional): The id of the current training process, equal
to the value of the environment variable PADDLE_TRAINER_ID. If
None, it will be initialized by the :meth:`paddle.distributed.get_rank`
method. Default: None.
Returns:
SamplerHelper: A new sliced :class:`SamplerHelper` object.
Example:
.. code-block:: python
from paddlenlp.data import SamplerHelper
from paddle.io import Dataset
class MyDataset(Dataset):
def __init__(self):
super(MyDataset, self).__init__()
self.data = [
[[1, 2, 3, 4], [1]],
[[5, 6, 7], [0]],
[[8, 9], [1]],
]
def __getitem__(self, index):
data = self.data[index][0]
label = self.data[index][1]
return data, label
def __len__(self):
return len(self.data)
dataset = MyDataset()
sampler = SamplerHelper(dataset)
print(list(sampler)) # indices of dataset elements
# [0, 1, 2]
sampler = sampler.shard(num_replicas=2)
print(list(sampler)) # indices of dataset elements
# [0, 2]
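# A minimal sketch: the slice seen by the second worker (rank 1). When
# the sample count is not divisible by `num_replicas`, the last sample
# may be repeated so that every rank receives the same number of samples.
sampler = SamplerHelper(dataset).shard(num_replicas=2, rank=1)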
"""
if num_replicas is None:
num_replicas = dist.get_world_size()
if rank is None:
rank = dist.get_rank()
def _impl():
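# Every `num_replicas`-th index (offset by `rank`) belongs to this rank;
# trailing indices may be re-yielded to keep ranks evenly sized.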
for i, idx in enumerate(self):
if i % num_replicas == rank:
yield idx
if i % num_replicas != num_replicas - 1 and rank > i % num_replicas:
# use last samples to make it evenly divisible
yield idx
sampler = type(self)(self.data_source, _impl)
if self.length is not None:
sampler.length = int(math.ceil(self.length * 1.0 / num_replicas))
else:
sampler.length = None
return sampler
def list(self):
# Produces a sampler whose `iter` returns a list iterator. Since `list`
# fetches all contents at once, the exact length can be recorded.
def _impl():
indices = list(iter(self))
self.length = len(indices)
return iter(indices)
return type(self)(self.data_source, _impl)
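# A minimal sketch (`drop_first` is a hypothetical transformation): `list`
# lets a transformed sampler record its exact length once iterated.
#
#     def drop_first(sampler):
#         def _impl():
#             it = iter(sampler)
#             next(it, None)           # skip the first index
#             return it
#         return _impl
#
#     filtered = SamplerHelper(dataset).apply(drop_first)
#     print(filtered.length)           # 3, falls back to len(dataset)
#     _ = list(filtered.list())        # materializing records the real length
#     print(filtered.length)           # 2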
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import jieba
from .vocab import Vocab
def get_idx_from_word(word, word_to_idx, unk_word):
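# Look up `word` in `word_to_idx`; fall back to the id of `unk_word` if absent.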
if word in word_to_idx:
return word_to_idx[word]
return word_to_idx[unk_word]
class BaseTokenizer(object):
def __init__(self, vocab):
self.vocab = vocab
def get_tokenizer(self):
return self.tokenizer
def cut(self, sentence):
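# To be overridden by subclasses: split `sentence` into a list of tokens.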
pass
def encode(self, sentence):
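# To be overridden by subclasses: convert `sentence` into a list of token ids.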
pass
class JiebaTokenizer(BaseTokenizer):
"""
Constructs a tokenizer based on `jieba <https://github.com/fxsjy/jieba>`__.
It supports the :meth:`cut` method to split the text into tokens, and the
:meth:`encode` method to convert the text to token ids.
Args:
vocab(paddlenlp.data.Vocab): An instance of :class:`paddlenlp.data.Vocab`.
"""
def __init__(self, vocab):
super(JiebaTokenizer, self).__init__(vocab)
self.tokenizer = jieba.Tokenizer()
# Initialize jieba directly from the vocab: register every vocab token in
# jieba's internal frequency dict and mark the tokenizer as initialized so
# that the default dictionary is not loaded.
self.tokenizer.FREQ = {key: 1 for key in self.vocab.token_to_idx.keys()}
self.tokenizer.total = len(self.tokenizer.FREQ)
self.tokenizer.initialized = True
def cut(self, sentence, cut_all=False, use_hmm=True):
"""
The method used to cut the text to tokens.
Args:
sentence(str): The text to be tokenized.
cut_all(bool, optional): Whether to use the full mode. If True, the
full mode is used, which gets all the possible words from the
sentence and is fast but not accurate. If False, the accurate mode
is used, which attempts to cut the sentence into the most accurate
segmentation and is suitable for text analysis. Default: False.
use_hmm(bool, optional): Whether to use the HMM model. Default: True.
Returns:
list[str]: A list of tokens.
Example:
.. code-block:: python
from paddlenlp.data import Vocab, JiebaTokenizer
# The vocab file. The sample file can be downloaded first:
# wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
vocab_file_path = './senta_word_dict.txt'
# Initialize the Vocab
vocab = Vocab.load_vocabulary(
vocab_file_path,
unk_token='[UNK]',
pad_token='[PAD]')
tokenizer = JiebaTokenizer(vocab)
tokens = tokenizer.cut('我爱你中国')
print(tokens)
# ['我爱你', '中国']
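# A minimal sketch: the full mode returns all possible in-vocab words,
# typically producing more and shorter tokens.
tokens = tokenizer.cut('我爱你中国', cut_all=True)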
"""
return self.tokenizer.lcut(sentence, cut_all, use_hmm)
def encode(self, sentence, cut_all=False, use_hmm=True):
"""
The method used to convert the text to ids. It first calls the
:meth:`cut` method to split the text into tokens, and then converts
the tokens to ids using `vocab`.
Args:
sentence(str): The text to be tokenized.
cut_all(bool, optional): Whether to use the full mode. If True, the
full mode is used, which gets all the possible words from the
sentence and is fast but not accurate. If False, the accurate mode
is used, which attempts to cut the sentence into the most accurate
segmentation and is suitable for text analysis. Default: False.
use_hmm(bool, optional): Whether to use the HMM model. Default: True.
Returns:
list[int]: A list of ids.
Example:
.. code-block:: python
from paddlenlp.data import Vocab, JiebaTokenizer
# The vocab file. The sample file can be downloaded first:
# wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
vocab_file_path = './senta_word_dict.txt'
# Initialize the Vocab
vocab = Vocab.load_vocabulary(
vocab_file_path,
unk_token='[UNK]',
pad_token='[PAD]')
tokenizer = JiebaTokenizer(vocab)
ids = tokenizer.encode('我爱你中国')
print(ids)
# [1170578, 575565]
"""
words = self.cut(sentence, cut_all, use_hmm)
return [
get_idx_from_word(word, self.vocab.token_to_idx,
self.vocab.unk_token) for word in words
]