"vscode:/vscode.git/clone" did not exist on "3c30e04127eaac5bc0dd8c52db6a573ec69cd895"
Commit 27dab946 authored by huchen

Merge branch 'GNMT-v2' into 'main'

Updated GNMT v2

See merge request dcutoolkit/deeplearing/dlexamples_new!11
parents 20291e9d 07c30a15
@@ -20,9 +20,7 @@
 import argparse
 from collections import Counter
-import sys
-import importlib
-importlib.reload(sys)
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Clean dataset')
@@ -32,8 +30,7 @@ def parse_args():
 def save_output(fname, data):
-    #with open(fname, 'w') as f:
-    with open(fname, 'w', encoding='utf-8') as f:
+    with open(fname, 'w') as f:
         f.writelines(data)
@@ -74,8 +71,7 @@ def main():
     data1 = []
     data2 = []
-    #with open(args.file1) as f1, open(args.file2) as f2:
-    with open(args.file1, 'r', encoding='utf-8') as f1, open(args.file2, 'r', encoding='utf-8') as f2:
+    with open(args.file1) as f1, open(args.file2) as f2:
         for idx, lines in enumerate(zip(f1, f2)):
             line1, line2 = lines
             if idx % 100000 == 1:
...
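Note: the removed encoding='utf-8' arguments only change behaviour when the system locale's default encoding is not UTF-8. A minimal sketch of the difference (the file name is hypothetical):

# Sketch: explicit vs. locale-dependent text encoding in Python 3.
# 'sample.de' is a hypothetical file name used only for illustration.
with open('sample.de', 'w', encoding='utf-8') as f:   # always writes UTF-8
    f.write('Straße\n')
with open('sample.de') as f:                          # decodes with locale.getpreferredencoding()
    print(f.read())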
@@ -22,7 +22,7 @@
 set -e
-DATASET_DIR='../wmt16_de_en/'
+DATASET_DIR='data/wmt16_de_en'
 ACTUAL_SRC_TRAIN=`cat ${DATASET_DIR}/train.tok.clean.bpe.32000.en |md5sum`
 EXPECTED_SRC_TRAIN='b7482095b787264a310d4933d197a134 -'
...
@@ -64,9 +64,7 @@ wget -nc -nv -O ${OUTPUT_DIR_DATA}/dev.tgz \
 wget -nc -nv -O ${OUTPUT_DIR_DATA}/test.tgz \
   http://data.statmt.org/wmt16/translation-task/test.tgz
-OUTPUT_DIR=${1:-"/public/home/aiss/code/mlperf/wmt16_de_en"}
-OUTPUT_DIR_DATA="${OUTPUT_DIR}/data"
-## Extract everything
+# Extract everything
 echo "Extracting all files..."
 mkdir -p "${OUTPUT_DIR_DATA}/europarl-v7-de-en"
 tar -xvzf "${OUTPUT_DIR_DATA}/europarl-v7-de-en.tgz" -C "${OUTPUT_DIR_DATA}/europarl-v7-de-en"
@@ -160,10 +158,10 @@ cat "${OUTPUT_DIR}/newstest2015.tok.clean.de" \
   > "${OUTPUT_DIR}/newstest_dev.tok.clean.de"
 # Filter datasets
-python3 `pwd`/scripts/filter_dataset.py \
+python3 scripts/filter_dataset.py \
   -f1 ${OUTPUT_DIR}/train.tok.clean.en \
   -f2 ${OUTPUT_DIR}/train.tok.clean.de
-python3 `pwd`/scripts/filter_dataset.py \
+python3 scripts/filter_dataset.py \
   -f1 ${OUTPUT_DIR}/newstest_dev.tok.clean.en \
   -f2 ${OUTPUT_DIR}/newstest_dev.tok.clean.de
@@ -173,23 +171,20 @@ python3 `pwd`/scripts/filter_dataset.py \
 for merge_ops in 32000; do
   echo "Learning BPE with merge_ops=${merge_ops}. This may take a while..."
   cat "${OUTPUT_DIR}/train.tok.clean.de" "${OUTPUT_DIR}/train.tok.clean.en" | \
-    #subword-nmt learn-bpe -s $merge_ops > "${OUTPUT_DIR}/bpe.${merge_ops}"
-    ${OUTPUT_DIR}/subword-nmt/learn_bpe.py -s $merge_ops > "${OUTPUT_DIR}/bpe.${merge_ops}"
+    subword-nmt learn-bpe -s $merge_ops > "${OUTPUT_DIR}/bpe.${merge_ops}"
   echo "Apply BPE with merge_ops=${merge_ops} to tokenized files..."
   for lang in en de; do
     for f in ${OUTPUT_DIR}/*.tok.${lang} ${OUTPUT_DIR}/*.tok.clean.${lang}; do
       outfile="${f%.*}.bpe.${merge_ops}.${lang}"
-      #subword-nmt apply-bpe -c "${OUTPUT_DIR}/bpe.${merge_ops}" < $f > "${outfile}"
-      ${OUTPUT_DIR}/subword-nmt/apply_bpe.py -c "${OUTPUT_DIR}/bpe.${merge_ops}" < $f > "${outfile}"
+      subword-nmt apply-bpe -c "${OUTPUT_DIR}/bpe.${merge_ops}" < $f > "${outfile}"
       echo ${outfile}
     done
   done
   # Create vocabulary file for BPE
   cat "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.en" "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.de" | \
-    #subword-nmt get-vocab | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
-    ${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
+    subword-nmt get-vocab | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
 done
...
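For reference, the subword-nmt console commands restored above can also be driven from Python. A rough sketch using the pip-installed subword_nmt package (file paths are placeholders; the script above pipes the concatenated de+en training data instead of a single file):

# Sketch: learning and applying BPE through the subword_nmt Python API.
import codecs
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

with codecs.open('train.tok.clean.en', encoding='utf-8') as infile, \
     codecs.open('bpe.32000', 'w', encoding='utf-8') as codes_out:
    learn_bpe(infile, codes_out, num_symbols=32000)    # cf. subword-nmt learn-bpe -s 32000

with codecs.open('bpe.32000', encoding='utf-8') as codes:
    bpe = BPE(codes)                                   # cf. subword-nmt apply-bpe -c bpe.32000
print(bpe.process_line('municipalities of the district'))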
#include <torch/torch.h>
#include <vector>
// CUDA forward declarations
at::Tensor attn_score_forward_cuda(
const at::Tensor &attn_query,
const at::Tensor &attn_keys,
const at::Tensor &bias,
const at::Tensor &linear_attn);
std::vector<at::Tensor> attn_score_backward_cuda(
const at::Tensor &grad_output,
const at::Tensor &attn_query,
const at::Tensor &attn_keys,
const at::Tensor &bias,
const at::Tensor &linear_attn);
// C++ interface
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
at::Tensor attn_score_forward(
const at::Tensor &attn_query,
const at::Tensor &attn_keys,
const at::Tensor &bias,
const at::Tensor &linear_attn) {
CHECK_INPUT(attn_query);
CHECK_INPUT(attn_keys);
CHECK_INPUT(bias);
CHECK_INPUT(linear_attn);
return attn_score_forward_cuda(attn_query, attn_keys, bias, linear_attn);
}
std::vector<at::Tensor> attn_score_backward(
const at::Tensor &grad_output,
const at::Tensor &attn_query,
const at::Tensor &attn_keys,
const at::Tensor &bias,
const at::Tensor &linear_attn) {
CHECK_INPUT(grad_output);
CHECK_INPUT(attn_query);
CHECK_INPUT(attn_keys);
CHECK_INPUT(bias);
CHECK_INPUT(linear_attn);
return attn_score_backward_cuda(grad_output, attn_query, attn_keys, bias, linear_attn);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &attn_score_forward, "Attention score calculation forward (CUDA)");
m.def("backward", &attn_score_backward, "Attention score calculation backward (CUDA)");
}
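The file above is only the host-side interface; attn_score_forward_cuda and attn_score_backward_cuda are defined in a separate kernel source. A hedged sketch of how such an extension is typically JIT-built and called from Python (source file names, module name, tensor shapes and the score formula are assumptions, not taken from this merge request):

# Sketch: building and calling the fused attention-score extension.
import torch
from torch.utils.cpp_extension import load

attn_score = load(name='attn_score',                      # hypothetical module name
                  sources=['attn_score_cuda.cpp',         # hypothetical file names
                           'attn_score_cuda_kernel.cu'])

q = torch.randn(128, 1, 1024, device='cuda')    # assumed shape: [batch, t_q, hidden]
k = torch.randn(128, 50, 1024, device='cuda')   # assumed shape: [batch, t_k, hidden]
bias = torch.randn(1024, device='cuda')
lin = torch.randn(1024, device='cuda')
# Presumably the additive (Bahdanau) score: sum over hidden of lin * tanh(q + k + bias).
scores = attn_score.forward(q, k, bias, lin)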
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <torch/torch.h>
namespace at {
namespace native {
at::Tensor revert_varlen_tensor(const Tensor &input, const Tensor &offsets);
at::Tensor get_offsets(const Tensor &input, const Tensor &lengths);
void checkLongTensor(const Tensor &tensor);
at::Tensor set_mask_cpp(const Tensor &_lengths) {
at::native::checkLongTensor(_lengths);
int64_t batch_size = _lengths.size(0);
int64_t *lengths = _lengths.data_ptr<int64_t>();
int64_t seq_length = (lengths == NULL) ? 0 : lengths[0];
auto output = torch::empty({seq_length, batch_size}, torch::CPU(at::kByte));
auto output_data = output.data_ptr<uint8_t>();
for (int64_t t = 0; t < seq_length; t++) {
for (int64_t i = 0; i < batch_size; i++) {
if (lengths[i] > t) {
output_data[t * batch_size + i] = 1;
} else {
output_data[t * batch_size + i] = 0;
}
}
}
return output;
}
} // namespace native
} // namespace at
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("revert_varlen_tensor", &at::native::revert_varlen_tensor);
m.def("set_mask_cpp", &at::native::set_mask_cpp);
m.def("get_offsets", &at::native::get_offsets);
}
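set_mask_cpp assumes lengths[0] is the longest length (lengths sorted in descending order) and returns a [seq_length, batch_size] byte mask with mask[t, i] = 1 iff lengths[i] > t. A pure-PyTorch reference with the same semantics, handy for sanity checks (not part of this commit):

# Reference sketch: what set_mask_cpp computes, in plain PyTorch.
import torch

def set_mask_reference(lengths: torch.Tensor) -> torch.Tensor:
    # lengths: 1D CPU int64 tensor, sorted in descending order
    seq_length = int(lengths[0]) if lengths.numel() > 0 else 0
    t = torch.arange(seq_length).unsqueeze(1)            # [seq_length, 1]
    return (lengths.unsqueeze(0) > t).to(torch.uint8)    # [seq_length, batch_size]

print(set_mask_reference(torch.tensor([4, 2, 1])))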
#include "hip/hip_runtime.h"
#include "ATen/hip/HIPContext.h"
#include <ATen/ATen.h>
#include <torch/torch.h>
#include <torch/types.h>
namespace at {
namespace native {
namespace {
template <typename scalar_t>
__global__ void revert_varlen_kernel(scalar_t *in, scalar_t *out,
int64_t *offsets, int feature_size, int n,
scalar_t pad_value) {
const int offset = static_cast<int>(offsets[blockIdx.x]);
for (int i = threadIdx.x; i < feature_size; i += blockDim.x) {
out[blockIdx.x * feature_size + i] =
(offset >= 0) ? in[offset + i] : pad_value;
}
}
} // namespace
void checkLongTensor(const Tensor &tensor) {
TORCH_CHECK(tensor.dim() == 1 && tensor.device() == at::kCPU &&
tensor.scalar_type() == at::kLong,
"'lengths' argument should be a 1D CPU int64 tensor");
}
at::Tensor revert_varlen_tensor(const Tensor &_input, const Tensor &_offsets) {
auto input = _input.contiguous();
auto output = torch::empty_like(input);
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
assert(_offsets.dim() == 1);
assert(_offsets.is_cuda());
assert(_offsets.scalar_type() == at::kLong);
TORCH_CHECK(_offsets.dim() == 1 && _offsets.is_cuda() &&
_offsets.scalar_type() == at::kLong,
"'offsets' argument should be a 1D CUDA int64 tensor");
TORCH_CHECK(_offsets.numel() == batch_size * seq_length,
"Expected `len(offsets) = batch_size * seq_length`, but got ",
_offsets.numel(), " (batch_size=", batch_size,
", seq_length=", seq_length, ")");
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
int numThreads = 512;
int numBlocks = batch_size * seq_length;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "revert_varlen", [&] {
hipLaunchKernelGGL(revert_varlen_kernel, dim3(numBlocks), dim3(numThreads), 0, at::cuda::getCurrentHIPStream(),
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
_offsets.data_ptr<int64_t>(), feature_size, batch_size * seq_length,
static_cast<scalar_t>(0));
});
return output;
}
at::Tensor get_offsets(const Tensor &_input, const Tensor &_lengths) {
at::native::checkLongTensor(_lengths);
auto input = _input.contiguous();
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
int64_t *lengths = _lengths.data_ptr<int64_t>();
TORCH_CHECK(_lengths.size(0) == batch_size,
"Expected `len(lengths)` to be equal to batch_size, but got ",
_lengths.size(0), " (batch_size=", batch_size, ")");
TORCH_CHECK(
(lengths[batch_size - 1] > 0),
"Length of all samples has to be greater than 0, but found an element "
"in 'lengths' that is <= 0");
std::vector<int64_t> offsets;
offsets.reserve(batch_size * seq_length);
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
for (int64_t t = 0; t < seq_length; t++) {
for (int64_t i = 0; i < batch_size; i++) {
if (lengths[i] > t) {
offsets.push_back(i * feature_size +
(lengths[i] - t - 1) * batch_size * feature_size);
} else {
offsets.push_back(-1);
}
}
}
auto options = at::TensorOptions().device(at::kCUDA).dtype(at::kLong);
auto offsets_tensor =
at::from_blob(offsets.data(), batch_size * seq_length, at::kLong)
.to(options, /* non_blocking */ true, /*copy*/ false);
return offsets_tensor;
}
} // namespace native
} // namespace at
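get_offsets precomputes, for every (time step, batch) position of the output, the flattened input offset of the element that should land there (or -1 for padding), and revert_varlen_kernel gathers accordingly. The net effect is to reverse each sequence of a contiguous [seq_length, batch, ...] tensor within its own length and zero-fill the padded tail. A pure-PyTorch reference sketch for checking the kernels (not part of this commit):

# Reference sketch: reverse each padded sequence within its valid length.
import torch

def revert_varlen_reference(x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    # x: [T, B, ...]; lengths: 1D CPU int64 with one valid length per batch entry
    out = torch.zeros_like(x)
    for i in range(x.size(1)):
        n = int(lengths[i])
        out[:n, i] = x[:n, i].flip(0)   # reverse the first n time steps of sequence i
    return out

x = torch.arange(12, dtype=torch.float32).view(4, 3, 1)   # T=4, B=3, one feature
print(revert_varlen_reference(x, torch.tensor([4, 2, 1])))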
#include "ATen/cuda/CUDAContext.h"
#include <ATen/ATen.h>
#include <torch/torch.h>
#include <torch/types.h>
namespace at {
namespace native {
namespace {
template <typename scalar_t>
__global__ void revert_varlen_kernel(scalar_t *in, scalar_t *out,
int64_t *offsets, int feature_size, int n,
scalar_t pad_value) {
const int offset = static_cast<int>(offsets[blockIdx.x]);
for (int i = threadIdx.x; i < feature_size; i += blockDim.x) {
out[blockIdx.x * feature_size + i] =
(offset >= 0) ? in[offset + i] : pad_value;
}
}
} // namespace
void checkLongTensor(const Tensor &tensor) {
TORCH_CHECK(tensor.dim() == 1 && tensor.device() == at::kCPU &&
tensor.scalar_type() == at::kLong,
"'lengths' argument should be a 1D CPU int64 tensor");
}
at::Tensor revert_varlen_tensor(const Tensor &_input, const Tensor &_offsets) {
auto input = _input.contiguous();
auto output = torch::empty_like(input);
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
assert(_offsets.dim() == 1);
assert(_offsets.is_cuda());
assert(_offsets.scalar_type() == at::kLong);
TORCH_CHECK(_offsets.dim() == 1 && _offsets.is_cuda() &&
_offsets.scalar_type() == at::kLong,
"'offsets' argument should be a 1D CUDA int64 tensor");
TORCH_CHECK(_offsets.numel() == batch_size * seq_length,
"Expected `len(offsets) = batch_size * seq_length`, but got ",
_offsets.numel(), " (batch_size=", batch_size,
", seq_length=", seq_length, ")");
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
int numThreads = 512;
int numBlocks = batch_size * seq_length;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "revert_varlen", [&] {
revert_varlen_kernel<<<numBlocks, numThreads, 0,
at::cuda::getCurrentCUDAStream()>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
_offsets.data_ptr<int64_t>(), feature_size, batch_size * seq_length,
static_cast<scalar_t>(0));
});
return output;
}
at::Tensor get_offsets(const Tensor &_input, const Tensor &_lengths) {
at::native::checkLongTensor(_lengths);
auto input = _input.contiguous();
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
int64_t *lengths = _lengths.data_ptr<int64_t>();
TORCH_CHECK(_lengths.size(0) == batch_size,
"Expected `len(lengths)` to be equal to batch_size, but got ",
_lengths.size(0), " (batch_size=", batch_size, ")");
TORCH_CHECK(
(lengths[batch_size - 1] > 0),
"Length of all samples has to be greater than 0, but found an element "
"in 'lengths' that is <= 0");
std::vector<int64_t> offsets;
offsets.reserve(batch_size * seq_length);
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
for (int64_t t = 0; t < seq_length; t++) {
for (int64_t i = 0; i < batch_size; i++) {
if (lengths[i] > t) {
offsets.push_back(i * feature_size +
(lengths[i] - t - 1) * batch_size * feature_size);
} else {
offsets.push_back(-1);
}
}
}
auto options = at::TensorOptions().device(at::kCUDA).dtype(at::kLong);
auto offsets_tensor =
at::from_blob(offsets.data(), batch_size * seq_length, at::kLong)
.to(options, /* non_blocking */ true, /*copy*/ false);
return offsets_tensor;
}
} // namespace native
} // namespace at
#include "hip/hip_runtime.h"
#include "ATen/hip/HIPContext.h"
#include <ATen/ATen.h>
#include <torch/torch.h>
#include <torch/types.h>
namespace at {
namespace native {
namespace {
template <typename scalar_t>
__global__ void revert_varlen_kernel(scalar_t *in, scalar_t *out,
int64_t *offsets, int feature_size, int n,
scalar_t pad_value) {
const int offset = static_cast<int>(offsets[blockIdx.x]);
for (int i = threadIdx.x; i < feature_size; i += blockDim.x) {
out[blockIdx.x * feature_size + i] =
(offset >= 0) ? in[offset + i] : pad_value;
}
}
} // namespace
void checkLongTensor(const Tensor &tensor) {
TORCH_CHECK(tensor.dim() == 1 && tensor.device() == at::kCPU &&
tensor.scalar_type() == at::kLong,
"'lengths' argument should be a 1D CPU int64 tensor");
}
at::Tensor revert_varlen_tensor(const Tensor &_input, const Tensor &_offsets) {
auto input = _input.contiguous();
auto output = torch::empty_like(input);
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
assert(_offsets.dim() == 1);
assert(_offsets.is_cuda());
assert(_offsets.scalar_type() == at::kLong);
TORCH_CHECK(_offsets.dim() == 1 && _offsets.is_cuda() &&
_offsets.scalar_type() == at::kLong,
"'offsets' argument should be a 1D CUDA int64 tensor");
TORCH_CHECK(_offsets.numel() == batch_size * seq_length,
"Expected `len(offsets) = batch_size * seq_length`, but got ",
_offsets.numel(), " (batch_size=", batch_size,
", seq_length=", seq_length, ")");
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
int numThreads = 512;
int numBlocks = batch_size * seq_length;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "revert_varlen", [&] {
// hipLaunchKernelGGL(revert_varlen_kernel, dim3(numBlocks), dim3(numThreads), 0, at::cuda::getCurrentHIPStream(),
// input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
// _offsets.data_ptr<int64_t>(), feature_size, batch_size * seq_length,
// static_cast<scalar_t>(0));
// });
revert_varlen_kernel<<<numBlocks, numThreads, 0,
at::cuda::getCurrentHIPStream()>>>(
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
_offsets.data_ptr<int64_t>(), feature_size, batch_size * seq_length,
static_cast<scalar_t>(0));
});
return output;
}
at::Tensor get_offsets(const Tensor &_input, const Tensor &_lengths) {
at::native::checkLongTensor(_lengths);
auto input = _input.contiguous();
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
int64_t *lengths = _lengths.data_ptr<int64_t>();
TORCH_CHECK(_lengths.size(0) == batch_size,
"Expected `len(lengths)` to be equal to batch_size, but got ",
_lengths.size(0), " (batch_size=", batch_size, ")");
TORCH_CHECK(
(lengths[batch_size - 1] > 0),
"Length of all samples has to be greater than 0, but found an element "
"in 'lengths' that is <= 0");
std::vector<int64_t> offsets;
offsets.reserve(batch_size * seq_length);
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
for (int64_t t = 0; t < seq_length; t++) {
for (int64_t i = 0; i < batch_size; i++) {
if (lengths[i] > t) {
offsets.push_back(i * feature_size +
(lengths[i] - t - 1) * batch_size * feature_size);
} else {
offsets.push_back(-1);
}
}
}
auto options = at::TensorOptions().device(at::kCUDA).dtype(at::kLong);
auto offsets_tensor =
at::from_blob(offsets.data(), batch_size * seq_length, at::kLong)
.to(options, /* non_blocking */ true, /*copy*/ false);
return offsets_tensor;
}
} // namespace native
} // namespace at
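Both kernel variants (the CUDA <<<...>>> launch and the HIP hipLaunchKernelGGL launch) can be compiled as regular PyTorch extensions; on a ROCm build of PyTorch, CUDAExtension typically routes .cu sources through the bundled hipify step. A hedged setup.py sketch (package and source file names are assumptions, not taken from this merge request):

# Sketch of a setup.py for the attn_score / pack_utils extensions.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='gnmt_extensions',                               # hypothetical package name
    ext_modules=[
        CUDAExtension('attn_score_cuda',
                      ['attn_score_cuda.cpp', 'attn_score_cuda_kernel.cu']),
        CUDAExtension('pack_utils',
                      ['pack_utils.cpp', 'pack_utils_kernel.cu']),
    ],
    cmdclass={'build_ext': BuildExtension},
)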
#include "hip/hip_runtime.h"
#include "ATen/hip/HIPContext.h"
#include <ATen/ATen.h>
#include <torch/torch.h>
#include <torch/types.h>
namespace at {
namespace native {
namespace {
template <typename scalar_t>
__global__ void revert_varlen_kernel(scalar_t *in, scalar_t *out,
int64_t *offsets, int feature_size, int n,
scalar_t pad_value) {
const int offset = static_cast<int>(offsets[blockIdx.x]);
for (int i = threadIdx.x; i < feature_size; i += blockDim.x) {
out[blockIdx.x * feature_size + i] =
(offset >= 0) ? in[offset + i] : pad_value;
}
}
} // namespace
void checkLongTensor(const Tensor &tensor) {
TORCH_CHECK(tensor.dim() == 1 && tensor.device() == at::kCPU &&
tensor.scalar_type() == at::kLong,
"'lengths' argument should be a 1D CPU int64 tensor");
}
at::Tensor revert_varlen_tensor(const Tensor &_input, const Tensor &_offsets) {
auto input = _input.contiguous();
auto output = torch::empty_like(input);
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
assert(_offsets.dim() == 1);
assert(_offsets.is_cuda());
assert(_offsets.scalar_type() == at::kLong);
TORCH_CHECK(_offsets.dim() == 1 && _offsets.is_cuda() &&
_offsets.scalar_type() == at::kLong,
"'offsets' argument should be a 1D CUDA int64 tensor");
TORCH_CHECK(_offsets.numel() == batch_size * seq_length,
"Expected `len(offsets) = batch_size * seq_length`, but got ",
_offsets.numel(), " (batch_size=", batch_size,
", seq_length=", seq_length, ")");
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
int numThreads = 512;
int numBlocks = batch_size * seq_length;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "revert_varlen", [&] {
hipLaunchKernelGGL(revert_varlen_kernel, dim3(numBlocks), dim3(numThreads), 0, at::cuda::getCurrentHIPStream(),
input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
_offsets.data_ptr<int64_t>(), feature_size, batch_size * seq_length,
static_cast<scalar_t>(0));
});
return output;
}
at::Tensor get_offsets(const Tensor &_input, const Tensor &_lengths) {
at::native::checkLongTensor(_lengths);
auto input = _input.contiguous();
int64_t seq_length = input.size(0);
int64_t batch_size = input.size(1);
int64_t *lengths = _lengths.data_ptr<int64_t>();
TORCH_CHECK(_lengths.size(0) == batch_size,
"Expected `len(lengths)` to be equal to batch_size, but got ",
_lengths.size(0), " (batch_size=", batch_size, ")");
TORCH_CHECK(
(lengths[batch_size - 1] > 0),
"Length of all samples has to be greater than 0, but found an element "
"in 'lengths' that is <= 0");
std::vector<int64_t> offsets;
offsets.reserve(batch_size * seq_length);
int64_t feature_size = 1;
for (int64_t dim = 2; dim < input.ndimension(); dim++) {
feature_size *= input.size(dim);
}
for (int64_t t = 0; t < seq_length; t++) {
for (int64_t i = 0; i < batch_size; i++) {
if (lengths[i] > t) {
offsets.push_back(i * feature_size +
(lengths[i] - t - 1) * batch_size * feature_size);
} else {
offsets.push_back(-1);
}
}
}
auto options = at::TensorOptions().device(at::kCUDA).dtype(at::kLong);
auto offsets_tensor =
at::from_blob(offsets.data(), batch_size * seq_length, at::kLong)
.to(options, /* non_blocking */ true, /*copy*/ false);
return offsets_tensor;
}
} // namespace native
} // namespace at