Unverified Commit 96f9c6de authored by Evgeny Tsykunov, committed by GitHub
Browse files

Parallelize CPU reference implementation in tests 2 (#1588)



Parallelize CPU reference implementation in tests
Signed-off-by: Evgeny Tsykunov <etsykunov@etsykunov-mlt.client.nvidia.com>
Co-authored-by: Evgeny Tsykunov <etsykunov@etsykunov-mlt.client.nvidia.com>
Co-authored-by: Oleg Goncharov <64355998+Oleg-Goncharov@users.noreply.github.com>
parent eee710a3
...@@ -9,7 +9,12 @@ set -e ...@@ -9,7 +9,12 @@ set -e
TE_LIB_PATH=`pip3 show transformer-engine | grep Location | cut -d ' ' -f 2` TE_LIB_PATH=`pip3 show transformer-engine | grep Location | cut -d ' ' -f 2`
export LD_LIBRARY_PATH=$TE_LIB_PATH:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$TE_LIB_PATH:$LD_LIBRARY_PATH
# Set parallelization parameters
NUM_PHYSICAL_CORES=$(nproc)
NUM_PARALLEL_JOBS=4
cd $TE_PATH/tests/cpp cd $TE_PATH/tests/cpp
cmake -GNinja -Bbuild . cmake -GNinja -Bbuild .
cmake --build build cmake --build build
ctest --test-dir build -j4 export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS))
ctest --test-dir build -j$NUM_PARALLEL_JOBS
...@@ -115,23 +115,52 @@ void compute_ref_x1(const ProcessingMethod processing_method, ...@@ -115,23 +115,52 @@ void compute_ref_x1(const ProcessingMethod processing_method,
const size_t block_size_X, const size_t block_size_X,
const size_t scales_stride) const size_t scales_stride)
{ {
std::vector<float> output_dbias_fp32(cols, 0); const size_t tile_size_Y = std::max(32lu, block_size_Y);
const size_t tile_size_X = std::max(64lu, block_size_X);
const size_t blocks_Y = (rows + block_size_Y - 1) / block_size_Y; const size_t tiles_num_Y = (rows + tile_size_Y - 1) / tile_size_Y;
const size_t blocks_X = (cols + block_size_X - 1) / block_size_X; const size_t tiles_num_X = (cols + tile_size_X - 1) / tile_size_X;
const size_t blocks_per_tile_Y = tile_size_Y / block_size_Y;
const size_t blocks_per_tile_X = tile_size_X / block_size_X;
for (size_t ii = 0; ii < blocks_Y; ++ii) { std::vector<float> output_dbias_fp32(cols, 0);
const size_t i_min = ii * block_size_Y; #pragma omp parallel proc_bind(spread)
const size_t i_max = std::min((ii + 1) * block_size_Y, rows); {
for (size_t jj = 0; jj < blocks_X; ++jj) { std::vector<float> thread_dbias(cols, 0);
const size_t j_min = jj * block_size_X; #pragma omp for schedule(static)
const size_t j_max = std::min((jj + 1) * block_size_X, cols); for (size_t t = 0; t < tiles_num_Y * tiles_num_X; ++t) {
const size_t scale_idx = ii * scales_stride + jj; const size_t tile_Y = t / tiles_num_X;
const size_t tile_X = t % tiles_num_X;
const size_t tile_offset_Y = tile_Y * tile_size_Y;
const size_t tile_offset_X = tile_X * tile_size_X;
for (size_t ii = 0; ii < blocks_per_tile_Y; ++ii) {
const size_t block_idx_Y = tile_Y * blocks_per_tile_Y + ii;
const size_t block_offset_Y = ii * block_size_Y;
const size_t i_min = tile_offset_Y + block_offset_Y;
if (i_min >= rows) continue;
const size_t i_max = std::min(i_min + block_size_Y, rows);
for (size_t jj = 0; jj < blocks_per_tile_X; ++jj) {
const size_t block_idx_X = tile_X * blocks_per_tile_X + jj;
const size_t block_offset_X = jj * block_size_X;
const size_t j_min = tile_offset_X + block_offset_X;
if (j_min >= cols) continue;
const size_t j_max = std::min(j_min + block_size_X, cols);
const size_t scale_idx = block_idx_Y * scales_stride + block_idx_X;
scale_block<InputType, OutputType, OP>( scale_block<InputType, OutputType, OP>(
processing_method, input, grad, output_c, output_dbias_fp32.data(), processing_method, input, grad, output_c, thread_dbias.data(),
output_scales, scale_idx, i_min, i_max, j_min, j_max, cols); output_scales, scale_idx, i_min, i_max, j_min, j_max, cols);
} }
} }
}
#pragma omp critical
{
for (size_t j = 0; j < cols; ++j) {
output_dbias_fp32[j] += thread_dbias[j];
}
}
}
for (size_t j = 0; j < cols; ++j) { for (size_t j = 0; j < cols; ++j) {
output_dbias[j] = static_cast<InputType>(output_dbias_fp32[j]); output_dbias[j] = static_cast<InputType>(output_dbias_fp32[j]);
} }
......
...@@ -61,20 +61,40 @@ void compute_ref_x1(const InputType* input, ...@@ -61,20 +61,40 @@ void compute_ref_x1(const InputType* input,
const size_t block_size_X, const size_t block_size_X,
const size_t scales_stride) const size_t scales_stride)
{ {
const size_t blocks_Y = (rows + block_size_Y - 1) / block_size_Y; const size_t tile_size_Y = std::max(32lu, block_size_Y);
const size_t blocks_X = (cols + block_size_X - 1) / block_size_X; const size_t tile_size_X = std::max(64lu, block_size_X);
const size_t tiles_num_Y = (rows + tile_size_Y - 1) / tile_size_Y;
for (size_t ii = 0; ii < blocks_Y; ++ii) { const size_t tiles_num_X = (cols + tile_size_X - 1) / tile_size_X;
const size_t i_min = ii * block_size_Y; const size_t blocks_per_tile_Y = tile_size_Y / block_size_Y;
const size_t i_max = std::min((ii + 1) * block_size_Y, rows); const size_t blocks_per_tile_X = tile_size_X / block_size_X;
for (size_t jj = 0; jj < blocks_X; ++jj) {
const size_t j_min = jj * block_size_X; #pragma omp parallel for schedule(static) proc_bind(spread)
const size_t j_max = std::min((jj + 1) * block_size_X, cols); for (size_t t = 0; t < tiles_num_Y * tiles_num_X; ++t) {
const size_t scale_idx = ii * scales_stride + jj; const size_t tile_Y = t / tiles_num_X;
const size_t tile_X = t % tiles_num_X;
const size_t tile_offset_Y = tile_Y * tile_size_Y;
const size_t tile_offset_X = tile_X * tile_size_X;
for (size_t ii = 0; ii < blocks_per_tile_Y; ++ii) {
const size_t block_idx_Y = tile_Y * blocks_per_tile_Y + ii;
const size_t block_offset_Y = ii * block_size_Y;
const size_t i_min = tile_offset_Y + block_offset_Y;
if (i_min >= rows) continue;
const size_t i_max = std::min(i_min + block_size_Y, rows);
for (size_t jj = 0; jj < blocks_per_tile_X; ++jj) {
const size_t block_idx_X = tile_X * blocks_per_tile_X + jj;
const size_t block_offset_X = jj * block_size_X;
const size_t j_min = tile_offset_X + block_offset_X;
if (j_min >= cols) continue;
const size_t j_max = std::min(j_min + block_size_X, cols);
const size_t scale_idx = block_idx_Y * scales_stride + block_idx_X;
dequantize_block<InputType, OutputType>( dequantize_block<InputType, OutputType>(
input, output, scales, scale_idx, i_min, i_max, j_min, j_max, cols); input, output, scales, scale_idx, i_min, i_max, j_min, j_max, cols);
} }
} }
}
} }
template <typename InputType, typename OutputType> template <typename InputType, typename OutputType>
......
...@@ -652,10 +652,15 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { ...@@ -652,10 +652,15 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) {
#pragma omp parallel proc_bind(spread) #pragma omp parallel proc_bind(spread)
{ {
std::mt19937 gen_local = *gen; std::mt19937 gen_local = *gen;
gen_local.discard(omp_get_thread_num() * 599); const int thread_ID = omp_get_thread_num();
const int threads_num = omp_get_max_threads();
const int chunk_size = (size + threads_num - 1) / threads_num;
const int idx_min = chunk_size * thread_ID;
const int idx_max = std::min(chunk_size * (thread_ID + 1), static_cast<int>(size));
gen_local.discard(idx_min);
std::uniform_real_distribution<> dis(-2.0, 1.0); std::uniform_real_distribution<> dis(-2.0, 1.0);
#pragma omp for schedule(static)
for (size_t i = 0; i < size; ++i) { for (int i = idx_min; i < idx_max; ++i) {
data[i] = static_cast<T>(dis(gen_local)); data[i] = static_cast<T>(dis(gen_local));
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment