From 8f7de8477759198d0944c3d6023cb803b4c452d8 Mon Sep 17 00:00:00 2001 From: yuguo960516yuguo Date: Tue, 25 Apr 2023 19:14:37 +0800 Subject: [PATCH] dtk --- README.md | 19 +- audit_dtk-22.04.2.py | 21 - audit_dtk-22.10.py | 21 - cmake/oneflow.cmake | 11 + .../embedding/cached_key_value_store.hip.cpp | 650 ++-- oneflow/core/embedding/full_cache.hip.cpp | 1278 +++---- oneflow/core/embedding/hash_functions.hip.h | 198 +- oneflow/core/embedding/lru_cache.hip.cpp | 1168 +++--- .../embedding/mock_key_value_store.hip.cpp | 496 +-- .../persistent_table_key_value_store.hip.cpp | 484 +-- oneflow/core/ep/rocm/cuda_device.cpp | 358 +- oneflow/core/ep/rocm/cuda_device.h | 156 +- oneflow/core/ep/rocm/cuda_device_manager.cpp | 136 +- oneflow/core/ep/rocm/cuda_device_manager.h | 108 +- .../ep/rocm/cuda_device_manager_factory.cpp | 234 +- oneflow/core/ep/rocm/cuda_event.cpp | 112 +- oneflow/core/ep/rocm/cuda_event.h | 100 +- oneflow/core/ep/rocm/cuda_stream.cpp | 360 +- oneflow/core/ep/rocm/cuda_stream.h | 336 +- oneflow/core/ep/rocm/primitive/add.hip.cpp | 278 +- .../ep/rocm/primitive/binary_functor.hip.h | 300 +- .../broadcast_elementwise_binary.hip.cpp | 218 +- .../broadcast_elementwise_binary.hip.h | 792 ++-- ...elementwise_binary_activation_grad.hip.cpp | 78 +- ...ast_elementwise_binary_comparision.hip.cpp | 76 +- ...oadcast_elementwise_binary_logical.hip.cpp | 76 +- .../broadcast_elementwise_binary_math.hip.cpp | 70 +- .../ep/rocm/primitive/broadcast_matmul.cpp | 474 +-- oneflow/core/ep/rocm/primitive/cast.hip.cpp | 296 +- .../ep/rocm/primitive/constant_pad.hip.cpp | 508 +-- .../core/ep/rocm/primitive/copy_nd.hip.cpp | 190 +- .../rocm/primitive/elementwise_unary.hip.cpp | 232 +- oneflow/core/ep/rocm/primitive/fill.hip.cpp | 302 +- oneflow/core/ep/rocm/primitive/memcpy.cpp | 124 +- oneflow/core/ep/rocm/primitive/memset.cpp | 118 +- .../core/ep/rocm/primitive/permute.hip.cpp | 666 ++-- .../core/ep/rocm/primitive/softmax.hip.cpp | 214 +- .../rocm/primitive/softmax_backward.hip.cpp | 232 +- oneflow/core/ep/rocm/primitive/type_seq.h | 154 +- .../ep/rocm/primitive/unary_functor.hip.h | 340 +- .../framework/random_generator_impl.hip.cpp | 90 +- oneflow/core/hip/atomic.hip.h | 428 +-- oneflow/core/hip/elementwise.hip.h | 486 +-- oneflow/core/hip/layer_norm.hip.h | 3212 ++++++++--------- oneflow/core/hip/softmax.hip.h | 2998 +++++++-------- oneflow/core/hip/unique.hip.h | 502 +-- .../nccl_executor_backend.hip.cpp | 1328 +++---- .../insert_nccl_logical_op_pass.cpp | 2 +- ...uential_one_embedding_shuffle_ops_pass.cpp | 158 +- ...uda_check_numerics_kernel_observer.hip.cpp | 264 +- oneflow/core/kernel/kernel_util.hip.h | 106 +- oneflow/core/kernel/random_generator.hip.cpp | 116 +- oneflow/core/kernel/util/numeric_limits.hip.h | 254 +- oneflow/core/kernel/util/numerics.hip.h | 498 +-- .../ndarray/ndarray_apply_binary_core.hip.cpp | 134 +- ...darray_apply_broadcast_binary_core.hip.cpp | 378 +- ...ndarray_apply_broadcast_unary_core.hip.cpp | 90 +- .../ndarray/ndarray_apply_unary_core.hip.cpp | 92 +- .../core/ndarray/ndarray_assign_core.hip.cpp | 124 +- .../core/ndarray/ndarray_reduce_impl.hip.cpp | 764 ++-- .../core/ndarray/xpu_ndarray_assign.hip.cpp | 122 +- oneflow/core/profiler/event.cpp | 180 +- oneflow/core/profiler/event.h | 372 +- oneflow/core/profiler/event_recorder.h | 120 +- oneflow/core/vm/sync_vm_mode_guard.h | 76 +- .../kernels/adaptive_pool_gpu_kernel.hip.cpp | 590 +-- .../user/kernels/affine_grid_kernel.hip.cpp | 264 +- .../user/kernels/arange_kernel_util.hip.cpp | 94 +- 
oneflow/user/kernels/arg_sort_kernel.hip.cpp | 294 +- .../kernels/arg_where_kernel_util.hip.cpp | 282 +- oneflow/user/kernels/argmax_kernel.hip.cpp | 386 +- .../user/kernels/as_strided_kernel.hip.cpp | 396 +- oneflow/user/kernels/assign_if_kernel.hip.cpp | 150 +- oneflow/user/kernels/avg_pool_kernel.hip.cpp | 398 +- .../kernels/batch_gather_kernel_util.hip.cpp | 204 +- .../binary_cross_entropy_kernel.hip.cpp | 406 +-- ...y_cross_entropy_with_logits_kernel.hip.cpp | 744 ++-- ...ss_entropy_with_logits_mean_kernel.hip.cpp | 552 +-- .../kernels/broadcast_pow_grad_kernel.hip.cpp | 174 +- ...gorical_ordinal_encode_kernel_util.hip.cpp | 248 +- .../user/kernels/clip_by_value_kernel.hip.cpp | 142 +- .../combined_margin_loss_kernel.hip.cpp | 448 +-- .../kernels/count_not_finite_kernel.hip.cpp | 344 +- .../user/kernels/ctc_greedy_decoder.hip.cpp | 290 +- .../user/kernels/ctc_loss_kernel_util.hip.cpp | 568 +-- .../user/kernels/cum_backward_kernel.hip.cpp | 276 +- .../user/kernels/cum_forward_kernel.hip.cpp | 336 +- .../user/kernels/data_shuffle_kernel.hip.cpp | 3044 ++++++++-------- oneflow/user/kernels/diag_kernel.hip.cpp | 158 +- oneflow/user/kernels/diagonal_kernel.hip.cpp | 324 +- .../kernels/dim_gather_kernel_util.hip.cpp | 128 +- .../kernels/dim_scatter_kernel_util.hip.cpp | 132 +- .../dim_scatter_scalar_kernel_util.hip.cpp | 100 +- .../distributions/normal_distribution.hip.cpp | 140 +- .../uniform_distribution.hip.cpp | 152 +- .../uniform_int_distribution.hip.cpp | 142 +- oneflow/user/kernels/dropout_kernel.hip.cpp | 924 ++--- ...dynamic_loss_scale_schedule_kernel.hip.cpp | 132 +- .../user/kernels/eager_nccl_kernels.hip.cpp | 806 ++--- ...elementwise_maximum_minimum_kernel.hip.cpp | 112 +- oneflow/user/kernels/embedding_kernel.hip.cpp | 318 +- .../kernels/embedding_kernel_util.hip.cpp | 362 +- oneflow/user/kernels/erfinv_kernel.hip.cpp | 120 +- oneflow/user/kernels/expand_kernel.hip.cpp | 438 +-- oneflow/user/kernels/eye_kernel_util.hip.cpp | 78 +- .../kernels/fake_quantization_kernel.hip.cpp | 318 +- oneflow/user/kernels/fill_kernel.hip.cpp | 120 +- oneflow/user/kernels/flip_kernel.hip.cpp | 206 +- oneflow/user/kernels/fold_kernel_util.hip.cpp | 148 +- .../kernels/fused_bias_add_kernel.hip.cpp | 910 ++--- .../kernels/fused_cast_scale_kernel.hip.cpp | 222 +- .../fused_cross_feature_interaction.hip.cpp | 516 +-- ...sed_cross_feature_interaction_grad.hip.cpp | 908 ++--- ...sed_dot_feature_interaction_kernel.hip.cpp | 1844 +++++----- .../kernels/fused_gru_cell_kernel.hip.cpp | 942 ++--- .../kernels/fused_lstm_cell_kernel.hip.cpp | 1008 +++--- .../fused_relu_dropout_grad_kernel.hip.cpp | 290 +- .../kernels/fused_scale_mask_softmax.hip.cpp | 470 +-- .../kernels/fused_scale_mask_softmax.hip.h | 430 +-- .../fused_scale_mask_softmax_dropout.hip.cpp | 604 ++-- ...ion_query_mul_key_and_value_kernel.hip.cpp | 584 +-- ...il_scale_softmax_mask_scale_kernel.hip.cpp | 456 +-- .../user/kernels/gather_kernel_util.hip.cpp | 244 +- ...m_batch_permutation_indices_kernel.hip.cpp | 274 +- .../heap_selection_top_k_kernel.hip.cpp | 464 +-- .../kernels/image_preprocess_kernels.hip.cpp | 430 +-- .../user/kernels/in_top_k_kernel_util.hip.cpp | 134 +- oneflow/user/kernels/kl_div_kernel.hip.cpp | 240 +- ...l2_regularize_gradient_kernel_util.hip.cpp | 100 +- .../user/kernels/l2_normalize_kernel.hip.cpp | 298 +- .../kernels/layer_norm_gpu_kernel.hip.cpp | 1143 +++--- .../math_binary_elementwise_kernel.hip.cpp | 486 +-- .../kernels/math_unary_elementwise_func.h | 1966 +++++----- .../math_unary_elementwise_kernel.hip.cpp | 352 
+-
 oneflow/user/kernels/max_pool_kernel.hip.cpp | 576 +--
 oneflow/user/kernels/median_kernel.hip.cpp | 136 +-
 .../median_with_indices_kernel.hip.cpp | 310 +-
 .../kernels/min_max_observer_kernel.hip.cpp | 518 +--
 .../kernels/model_update_kernel_util.hip.cpp | 1596 ++++----
 ...ng_average_min_max_observer_kernel.hip.cpp | 632 ++--
 .../user/kernels/multi_reduce_kernels.hip.cpp | 282 +-
 .../kernels/nd_index_slice_kernels.hip.cpp | 330 +-
 oneflow/user/kernels/nll_kernel_util.hip.cpp | 184 +-
 oneflow/user/kernels/nms_kernel.hip.cpp | 288 +-
 .../user/kernels/normalization_kernel.hip.cpp | 1066 +++---
 .../user/kernels/nvtx_range_kernel.hip.cpp | 276 +-
 .../kernels/one_embedding_kernels.hip.cpp | 1266 +++----
 .../one_embedding_update_kernels.hip.cpp | 1206 +++----
 oneflow/user/kernels/one_hot_kernel.hip.cpp | 160 +-
 .../user/kernels/pad2d_kernels_util.hip.cpp | 426 +--
 .../kernels/partial_fc_sample_kernel.hip.cpp | 860 ++---
 oneflow/user/kernels/prelu_kernel.hip.cpp | 1008 +++---
 .../user/kernels/quantization_kernel.hip.cpp | 316 +-
 oneflow/user/kernels/radix_sort.hip.h | 558 +--
 .../kernels/radix_sort_top_k_kernel.hip.cpp | 288 +-
 .../kernels/random_mask_generator.hip.cpp | 136 +-
 oneflow/user/kernels/randperm_kernel.hip.cpp | 400 +-
 .../kernels/repeat_interleave_kernel.hip.cpp | 144 +-
 oneflow/user/kernels/roi_align_kernel.hip.cpp | 602 +--
 oneflow/user/kernels/roll_kernel.hip.cpp | 588 +--
 .../user/kernels/scalar_math_kernels.hip.cpp | 444 +--
 .../user/kernels/search_sorted_kernel.hip.cpp | 256 +-
 .../sigmoid_cross_entropy_kernel.hip.cpp | 108 +-
 oneflow/user/kernels/slice_util.hip.cpp | 462 +--
 .../kernels/smooth_l1_loss_kernel.hip.cpp | 288 +-
 .../softmax_cross_entropy_kernel.hip.cpp | 310 +-
 oneflow/user/kernels/sort_kernel.hip.cpp | 160 +-
 .../sparse_cross_entropy_kernel_util.hip.cpp | 532 +--
 ...parse_softmax_cross_entropy_kernel.hip.cpp | 260 +-
 ..._softmax_cross_entropy_kernel_util.hip.cpp | 266 +-
 .../sqrt_square_sum_kernel_util.hip.cpp | 164 +-
 .../kernels/square_sum_kernel_util.hip.cpp | 208 +-
 oneflow/user/kernels/stateful_opkernel.cpp | 1802 ++++-----
 oneflow/user/kernels/tf_prelu_kernel.hip.cpp | 506 +--
 .../user/kernels/to_contiguous_kernel.hip.cpp | 320 +-
 oneflow/user/kernels/tril_kernel.hip.cpp | 510 +--
 oneflow/user/kernels/triu_kernel.hip.cpp | 260 +-
 .../two_stage_reduce_kernel_util.hip.cpp | 132 +-
 .../user/kernels/unfold_kernel_util.hip.cpp | 138 +-
 .../user/kernels/unfold_tensor_kernel.hip.cpp | 442 +--
 .../user/kernels/unique_kernel_util.hip.cpp | 172 +-
 .../unsorted_segment_sum_kernel_util.hip.cpp | 442 +--
 .../upsample_bicubic_2d_kernel.hip.cpp | 466 +--
 .../upsample_bilinear_2d_kernel.hip.cpp | 378 +-
 .../kernels/upsample_linear_1d_kernel.hip.cpp | 324 +-
 .../kernels/upsample_nearest_kernel.hip.cpp | 822 ++---
 .../upsample_trilinear_3d_kernel.hip.cpp | 472 +--
 .../user/kernels/variance_kernel_util.hip.cpp | 382 +-
 .../user/kernels/where_kernel_util.hip.cpp | 178 +-
 .../modules/fused_dot_feature_interaction.py | 86 +-
 python/oneflow/test/modules/test_conv.py | 692 ++--
 .../test/modules/test_softmax_cross_entropy | 348 +-
 .../test/profiler/test_profile_lenet.py | 296 +-
 version_script.lds | 14 +-
 194 files changed, 41577 insertions(+), 41396 deletions(-)
 delete mode 100644 audit_dtk-22.04.2.py
 delete mode 100644 audit_dtk-22.10.py

diff --git a/README.md b/README.md
index 15f73ae..8d49a2c 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,6 @@
-# OneFlow
+# OneFlow
 
-OneFlow is a deep learning framework designed to be **user-friendly, scalable and efficient**. With OneFlow, it is easy to:
-- program a model with **PyTorch-like API**
-- scale a model to n-dimensional-parallel/distributed execution with the **Global View API**
-- accelerate/deploy a model with the **Static Graph Compiler**.
+**OneFlow is a performance-centered and open-source deep learning framework.**
 
 [![Simple CI](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml/badge.svg)](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml)
 [![Nightly Docker Image](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml/badge.svg)](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml)
@@ -12,8 +9,10 @@
 
 ## Latest News
 
-- Version 0.8.0 is out!
-  - [Full changelog](https://github.com/Oneflow-Inc/oneflow/releases/tag/v0.8.0)
+- Version 0.7.0 is out!
+  - Introducing global tensor
+  - Semi-auto parallelization has landed
+  - [Full changelog](https://github.com/Oneflow-Inc/oneflow/releases/tag/v0.7.0)
 
 ## Publication
 
@@ -36,7 +35,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
 ### System Requirements
 
 - Linux. As for now, there is no pre-built release for macOS, Windows.
-- Python 3.7, 3.8, 3.9, 3.10
+- Python 3.6, 3.7, 3.8, 3.9, 3.10
 - (**Highly recommended**) Upgrade pip
 
   ```
@@ -54,7 +53,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
 - To install latest stable release of OneFlow with CUDA support:
 
   ```bash
-  python3 -m pip install oneflow
+  python3 -m pip install -f https://release.oneflow.info oneflow==0.7.0+cu102
   ```
 
 - To install nightly release of OneFlow with CUDA support:
@@ -67,7 +66,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
 - Stable
   ```bash
-  python3 -m pip install --find-links https://release.oneflow.info oneflow==0.8.0+[PLATFORM]
+  python3 -m pip install --find-links https://release.oneflow.info oneflow==0.7.0+[PLATFORM]
   ```
 
 - Nightly
   ```
diff --git a/audit_dtk-22.04.2.py b/audit_dtk-22.04.2.py
deleted file mode 100644
index 6646a3b..0000000
--- a/audit_dtk-22.04.2.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Monkey patch to not ship libjvm.so in pypi wheels
-import sys
-
-from auditwheel.main import main
-from auditwheel.policy import _POLICIES as POLICIES
-
-# libjvm is loaded dynamically; do not include it
-for p in POLICIES:
-    p['lib_whitelist'].append('librccl.so.1')
-    p['lib_whitelist'].append('libhipblas.so.0')
-    p['lib_whitelist'].append('libhiprand.so.1')
-    p['lib_whitelist'].append('librocrand.so.1')
-    p['lib_whitelist'].append('libMIOpen.so.1')
-    p['lib_whitelist'].append('libgalaxyhip.so.4')
-    p['lib_whitelist'].append('librocm_smi64.so.2')
-    p['lib_whitelist'].append('librocsolver.so.0 ')
-    p['lib_whitelist'].append('librocblas.so.0')
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/audit_dtk-22.10.py b/audit_dtk-22.10.py
deleted file mode 100644
index db45007..0000000
--- a/audit_dtk-22.10.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Monkey patch to not ship libjvm.so in pypi wheels
-import sys
-
-from auditwheel.main import main
-from auditwheel.policy import _POLICIES as POLICIES
-
-# libjvm is loaded dynamically; do not include it
-for p in POLICIES:
-    p['lib_whitelist'].append('librccl.so.1')
-    p['lib_whitelist'].append('libhipblas.so.0')
-    p['lib_whitelist'].append('libhiprand.so.1')
-    p['lib_whitelist'].append('librocrand.so.1')
-    p['lib_whitelist'].append('libMIOpen.so.1')
-    p['lib_whitelist'].append('libgalaxyhip.so.5')
-    p['lib_whitelist'].append('librocm_smi64.so.2')
-    p['lib_whitelist'].append('librocsolver.so.0 ')
-    p['lib_whitelist'].append('librocblas.so.0')
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake
index fff0d9d..a9de1f2 100644
--- a/cmake/oneflow.cmake
+++ b/cmake/oneflow.cmake
@@ -328,6 +328,17 @@ if(BUILD_PYTHON OR BUILD_CPP_API)
   endif()
 endif()
 
+if (BUILD_ROCM)
+  # AMD compiler fails to compile these three files with '-O1/2/3'.
+  # The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
+  # so '-O0' will override '-O1/2/3'.
+  set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel.hip.cpp
+    ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp
+    ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel.hip.cpp
+    # ${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math.hip.cpp
+    PROPERTIES COMPILE_OPTIONS "-O0")
+endif()
+
 if(BUILD_PYTHON)
 
   # py ext lib
diff --git a/oneflow/core/embedding/cached_key_value_store.hip.cpp b/oneflow/core/embedding/cached_key_value_store.hip.cpp
index 88456a4..a0a215e 100644
--- a/oneflow/core/embedding/cached_key_value_store.hip.cpp
+++ b/oneflow/core/embedding/cached_key_value_store.hip.cpp
@@ -1,326 +1,326 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/embedding/cached_key_value_store.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/ep/include/device_manager_registry.h" - -namespace oneflow { - -namespace embedding { - -namespace { -template -__global__ void PostStoreGetKernel(uint32_t num_cache_missing, uint32_t num_store_missing, - uint32_t num_elems_per_value, - const uint32_t* cache_missing_indices, - const uint32_t* store_missing_indices, const Elem* store_values, - Elem* values, uint32_t* missing_indices) { - const uint32_t num_cache_missing_elem = num_cache_missing * num_elems_per_value; - CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_cache_missing_elem) { - const uint32_t value_index = i / num_elems_per_value; - const uint32_t elem_index = i - value_index * num_elems_per_value; - values[cache_missing_indices[value_index] * num_elems_per_value + elem_index] = store_values[i]; - } - CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_store_missing) { - missing_indices[i] = cache_missing_indices[store_missing_indices[i]]; - } -} - -template -class CacheKeyValueStoreImpl : public KeyValueStore { - public: - OF_DISALLOW_COPY_AND_MOVE(CacheKeyValueStoreImpl); - CacheKeyValueStoreImpl(std::unique_ptr&& store, std::unique_ptr&& cache) - : store_(std::move(store)), cache_(std::move(cache)), synced_(true), max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - CHECK_EQ(store_->KeySize(), cache_->KeySize()); - CHECK_EQ(store_->ValueSize(), cache_->ValueSize()); - OF_CUDA_CHECK(hipMalloc(&num_buffer_, sizeof(uint32_t))); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_num_buffer_), sizeof(uint32_t))); - num_elems_per_value_ = store_->ValueSize() / sizeof(Elem); - } - ~CacheKeyValueStoreImpl() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipFree(num_buffer_)); - OF_CUDA_CHECK(hipHostFree(host_num_buffer_)); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(keys_buffer_)); - OF_CUDA_CHECK(hipFree(values_buffer_)); - OF_CUDA_CHECK(hipFree(indices_buffer0_)); - OF_CUDA_CHECK(hipFree(indices_buffer1_)); - } - cache_.reset(); - store_.reset(); - } - - uint32_t KeySize() const override { return store_->KeySize(); } - uint32_t ValueSize() const override { return store_->ValueSize(); } - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if (query_length > cache_->MaxQueryLength()) { cache_->ReserveQueryLength(query_length); } - if (query_length > store_->MaxQueryLength()) { store_->ReserveQueryLength(query_length); } - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(keys_buffer_)); - OF_CUDA_CHECK(hipFree(values_buffer_)); - OF_CUDA_CHECK(hipFree(indices_buffer0_)); - OF_CUDA_CHECK(hipFree(indices_buffer1_)); - } - OF_CUDA_CHECK(hipMalloc(&keys_buffer_, query_length * store_->KeySize())); - OF_CUDA_CHECK(hipMalloc(&values_buffer_, query_length * store_->ValueSize())); - OF_CUDA_CHECK(hipMalloc(&indices_buffer0_, query_length * sizeof(uint32_t))); - OF_CUDA_CHECK(hipMalloc(&indices_buffer1_, query_length * sizeof(uint32_t))); - max_query_length_ = query_length; - } - - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint32_t* n_missing, uint32_t* missing_indices) override; - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint8_t* mask) override; - void Put(ep::Stream* stream, uint32_t 
num_keys, const void* keys, const void* values) override; - void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - const void* update, const float* lr, float scale) override; - bool IsFusionSupported() override { - return cache_->Policy() == CacheOptions::Policy::kFull - && cache_->ValueType() == DataType::kFloat; - } - bool SnapshotExists(const std::string& name) override; - void LoadSnapshot(const std::string& name) override; - void SaveSnapshot(const std::string& name) override; - void LoadSnapshot(const std::string& name, - const std::function& Hook) override; - - private: - void SyncCacheToStore(); - - std::unique_ptr store_; - std::unique_ptr cache_; - - uint32_t* num_buffer_{}; - uint32_t* host_num_buffer_{}; - Key* keys_buffer_{}; - Elem* values_buffer_{}; - uint32_t* indices_buffer0_{}; - uint32_t* indices_buffer1_{}; - int device_index_{}; - uint32_t max_query_length_; - uint32_t num_elems_per_value_{}; - std::recursive_mutex mutex_; - bool synced_; -}; - -template -void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint32_t* n_missing, - uint32_t* missing_indices) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - if (cache_->Policy() == CacheOptions::Policy::kFull) { - cache_->Get(stream, num_keys, keys, values, n_missing, keys_buffer_, missing_indices); - return; - } else { - cache_->Get(stream, num_keys, keys, values, num_buffer_, keys_buffer_, indices_buffer0_); - } - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - const uint32_t num_cache_missing = *host_num_buffer_; - if (num_cache_missing == 0) { - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - return; - } - store_->Get(stream, num_cache_missing, keys_buffer_, values_buffer_, n_missing, indices_buffer1_); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, n_missing, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - const uint32_t num_store_missing = *host_num_buffer_; - RUN_CUDA_KERNEL((PostStoreGetKernel), stream, num_cache_missing * num_elems_per_value_, - num_cache_missing, num_store_missing, num_elems_per_value_, indices_buffer0_, - indices_buffer1_, values_buffer_, static_cast(values), missing_indices); -} - -template -void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint8_t* mask) { - std::lock_guard lock(mutex_); - if (cache_->Policy() == CacheOptions::Policy::kFull) { - cache_->Get(stream, num_keys, keys, values, mask); - return; - } else { - UNIMPLEMENTED(); - } -} - -template -void CacheKeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, - const void* values) { - std::lock_guard lock(mutex_); - synced_ = false; - auto cuda_stream = stream->As(); - cache_->Put(stream, num_keys, keys, values, num_buffer_, keys_buffer_, values_buffer_); - if (cache_->Policy() == CacheOptions::Policy::kFull) { return; } - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); -} - -template -void CacheKeyValueStoreImpl::FusedHalfUpdatePut(ep::Stream* stream, uint32_t num_keys, - const void* keys, const void* values, - const void* update, 
const float* lr, - float scale) { - std::lock_guard lock(mutex_); - if (cache_->Policy() != CacheOptions::Policy::kFull || cache_->ValueType() != DataType::kFloat) { - UNIMPLEMENTED(); - } - synced_ = false; - cache_->FusedHalfUpdatePut(stream, num_keys, keys, values, update, lr, scale, num_buffer_, - keys_buffer_, values_buffer_); -} - -template -bool CacheKeyValueStoreImpl::SnapshotExists(const std::string& name) { - return store_->SnapshotExists(name); -} - -template -void CacheKeyValueStoreImpl::LoadSnapshot(const std::string& name) { - LoadSnapshot(name, nullptr); -} - -template -void CacheKeyValueStoreImpl::LoadSnapshot( - const std::string& name, const std::function& Hook) { - CudaCurrentDeviceGuard guard(device_index_); - std::lock_guard lock(mutex_); - CHECK_GT(max_query_length_, 0); - cache_->Clear(); - auto device = - Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); - CHECK(device); - auto* stream = device->CreateStream(); - store_->LoadSnapshot(name, [&](KVIterator* iter) { - if (cache_->Policy() == CacheOptions::Policy::kFull) { - auto* cuda_stream = stream->As(); - while (true) { - iter->NextN(stream, max_query_length_, num_buffer_, keys_buffer_, values_buffer_); - OF_CUDA_CHECK(hipDeviceSynchronize()); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(stream->Sync()); - if (*host_num_buffer_ == 0) { return; } - cache_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_, num_buffer_, nullptr, - nullptr); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(stream->Sync()); - CHECK_EQ(*host_num_buffer_, 0); - } - } - if (Hook) { - iter->Reset(); - Hook(iter); - } - }); - device->DestroyStream(stream); - store_->LoadSnapshot(name); -} - -template -void CacheKeyValueStoreImpl::SaveSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - std::lock_guard lock(mutex_); - SyncCacheToStore(); - store_->SaveSnapshot(name); -} - -template -void CacheKeyValueStoreImpl::SyncCacheToStore() { - if (synced_) { return; } - CudaCurrentDeviceGuard guard(device_index_); - auto device = - Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); - CHECK(device); - auto* stream = device->CreateStream(); - auto* cuda_stream = stream->As(); - const uint64_t dump_capacity = cache_->DumpCapacity(); - CHECK_GT(max_query_length_, 0); - for (uint64_t start_key_index = 0; start_key_index < dump_capacity; - start_key_index += max_query_length_) { - cache_->Dump(stream, start_key_index, - std::min(start_key_index + max_query_length_, dump_capacity), num_buffer_, - keys_buffer_, values_buffer_); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(stream->Sync()); - if (*host_num_buffer_ == 0) { continue; } - store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); - CHECK_JUST(stream->Sync()); - } - device->DestroyStream(stream); - synced_ = true; -} - -template -std::unique_ptr DispatchElemType(std::unique_ptr&& store, - std::unique_ptr&& cache) { - const uint32_t value_size = store->ValueSize(); - if (value_size % sizeof(uint4) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } else if (value_size % sizeof(uint64_t) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); 
- } else if (value_size % sizeof(uint32_t) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } else if (value_size % sizeof(uint16_t) == 0) { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } else { - return std::unique_ptr( - new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); - } -} - -std::unique_ptr DispatchKeyType(std::unique_ptr&& store, - std::unique_ptr&& cache) { - const uint32_t key_size = store->KeySize(); - if (key_size == 4) { - return DispatchElemType(std::move(store), std::move(cache)); - } else if (key_size == 8) { - return DispatchElemType(std::move(store), std::move(cache)); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace - -std::unique_ptr NewCachedKeyValueStore(std::unique_ptr&& store, - std::unique_ptr&& cache) { - return DispatchKeyType(std::move(store), std::move(cache)); -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/embedding/cached_key_value_store.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/ep/include/device_manager_registry.h" + +namespace oneflow { + +namespace embedding { + +namespace { +template +__global__ void PostStoreGetKernel(uint32_t num_cache_missing, uint32_t num_store_missing, + uint32_t num_elems_per_value, + const uint32_t* cache_missing_indices, + const uint32_t* store_missing_indices, const Elem* store_values, + Elem* values, uint32_t* missing_indices) { + const uint32_t num_cache_missing_elem = num_cache_missing * num_elems_per_value; + CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_cache_missing_elem) { + const uint32_t value_index = i / num_elems_per_value; + const uint32_t elem_index = i - value_index * num_elems_per_value; + values[cache_missing_indices[value_index] * num_elems_per_value + elem_index] = store_values[i]; + } + CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_store_missing) { + missing_indices[i] = cache_missing_indices[store_missing_indices[i]]; + } +} + +template +class CacheKeyValueStoreImpl : public KeyValueStore { + public: + OF_DISALLOW_COPY_AND_MOVE(CacheKeyValueStoreImpl); + CacheKeyValueStoreImpl(std::unique_ptr&& store, std::unique_ptr&& cache) + : store_(std::move(store)), cache_(std::move(cache)), synced_(true), max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + CHECK_EQ(store_->KeySize(), cache_->KeySize()); + CHECK_EQ(store_->ValueSize(), cache_->ValueSize()); + OF_CUDA_CHECK(hipMalloc(&num_buffer_, sizeof(uint32_t))); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_num_buffer_), sizeof(uint32_t))); + num_elems_per_value_ = store_->ValueSize() / sizeof(Elem); + } + ~CacheKeyValueStoreImpl() { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipFree(num_buffer_)); + OF_CUDA_CHECK(hipHostFree(host_num_buffer_)); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(keys_buffer_)); + 
OF_CUDA_CHECK(hipFree(values_buffer_)); + OF_CUDA_CHECK(hipFree(indices_buffer0_)); + OF_CUDA_CHECK(hipFree(indices_buffer1_)); + } + cache_.reset(); + store_.reset(); + } + + uint32_t KeySize() const override { return store_->KeySize(); } + uint32_t ValueSize() const override { return store_->ValueSize(); } + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { return; } + if (query_length > cache_->MaxQueryLength()) { cache_->ReserveQueryLength(query_length); } + if (query_length > store_->MaxQueryLength()) { store_->ReserveQueryLength(query_length); } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(keys_buffer_)); + OF_CUDA_CHECK(hipFree(values_buffer_)); + OF_CUDA_CHECK(hipFree(indices_buffer0_)); + OF_CUDA_CHECK(hipFree(indices_buffer1_)); + } + OF_CUDA_CHECK(hipMalloc(&keys_buffer_, query_length * store_->KeySize())); + OF_CUDA_CHECK(hipMalloc(&values_buffer_, query_length * store_->ValueSize())); + OF_CUDA_CHECK(hipMalloc(&indices_buffer0_, query_length * sizeof(uint32_t))); + OF_CUDA_CHECK(hipMalloc(&indices_buffer1_, query_length * sizeof(uint32_t))); + max_query_length_ = query_length; + } + + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint32_t* n_missing, uint32_t* missing_indices) override; + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint8_t* mask) override; + void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; + void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + const void* update, const float* lr, float scale) override; + bool IsFusionSupported() override { + return cache_->Policy() == CacheOptions::Policy::kFull + && cache_->ValueType() == DataType::kFloat; + } + bool SnapshotExists(const std::string& name) override; + void LoadSnapshot(const std::string& name) override; + void SaveSnapshot(const std::string& name) override; + void LoadSnapshot(const std::string& name, + const std::function& Hook) override; + + private: + void SyncCacheToStore(); + + std::unique_ptr store_; + std::unique_ptr cache_; + + uint32_t* num_buffer_{}; + uint32_t* host_num_buffer_{}; + Key* keys_buffer_{}; + Elem* values_buffer_{}; + uint32_t* indices_buffer0_{}; + uint32_t* indices_buffer1_{}; + int device_index_{}; + uint32_t max_query_length_; + uint32_t num_elems_per_value_{}; + std::recursive_mutex mutex_; + bool synced_; +}; + +template +void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint32_t* n_missing, + uint32_t* missing_indices) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + if (cache_->Policy() == CacheOptions::Policy::kFull) { + cache_->Get(stream, num_keys, keys, values, n_missing, keys_buffer_, missing_indices); + return; + } else { + cache_->Get(stream, num_keys, keys, values, num_buffer_, keys_buffer_, indices_buffer0_); + } + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + const uint32_t num_cache_missing = *host_num_buffer_; + if (num_cache_missing == 0) { + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + return; + } + store_->Get(stream, num_cache_missing, keys_buffer_, values_buffer_, 
n_missing, indices_buffer1_); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, n_missing, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + const uint32_t num_store_missing = *host_num_buffer_; + RUN_CUDA_KERNEL((PostStoreGetKernel), stream, num_cache_missing * num_elems_per_value_, + num_cache_missing, num_store_missing, num_elems_per_value_, indices_buffer0_, + indices_buffer1_, values_buffer_, static_cast(values), missing_indices); +} + +template +void CacheKeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint8_t* mask) { + std::lock_guard lock(mutex_); + if (cache_->Policy() == CacheOptions::Policy::kFull) { + cache_->Get(stream, num_keys, keys, values, mask); + return; + } else { + UNIMPLEMENTED(); + } +} + +template +void CacheKeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, + const void* values) { + std::lock_guard lock(mutex_); + synced_ = false; + auto cuda_stream = stream->As(); + cache_->Put(stream, num_keys, keys, values, num_buffer_, keys_buffer_, values_buffer_); + if (cache_->Policy() == CacheOptions::Policy::kFull) { return; } + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); +} + +template +void CacheKeyValueStoreImpl::FusedHalfUpdatePut(ep::Stream* stream, uint32_t num_keys, + const void* keys, const void* values, + const void* update, const float* lr, + float scale) { + std::lock_guard lock(mutex_); + if (cache_->Policy() != CacheOptions::Policy::kFull || cache_->ValueType() != DataType::kFloat) { + UNIMPLEMENTED(); + } + synced_ = false; + cache_->FusedHalfUpdatePut(stream, num_keys, keys, values, update, lr, scale, num_buffer_, + keys_buffer_, values_buffer_); +} + +template +bool CacheKeyValueStoreImpl::SnapshotExists(const std::string& name) { + return store_->SnapshotExists(name); +} + +template +void CacheKeyValueStoreImpl::LoadSnapshot(const std::string& name) { + LoadSnapshot(name, nullptr); +} + +template +void CacheKeyValueStoreImpl::LoadSnapshot( + const std::string& name, const std::function& Hook) { + CudaCurrentDeviceGuard guard(device_index_); + std::lock_guard lock(mutex_); + CHECK_GT(max_query_length_, 0); + cache_->Clear(); + auto device = + Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); + CHECK(device); + auto* stream = device->CreateStream(); + store_->LoadSnapshot(name, [&](KVIterator* iter) { + if (cache_->Policy() == CacheOptions::Policy::kFull) { + auto* cuda_stream = stream->As(); + while (true) { + iter->NextN(stream, max_query_length_, num_buffer_, keys_buffer_, values_buffer_); + OF_CUDA_CHECK(hipDeviceSynchronize()); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(stream->Sync()); + if (*host_num_buffer_ == 0) { return; } + cache_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_, num_buffer_, nullptr, + nullptr); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(stream->Sync()); + CHECK_EQ(*host_num_buffer_, 0); + } + } + if (Hook) { + iter->Reset(); + Hook(iter); + } + }); + device->DestroyStream(stream); + store_->LoadSnapshot(name); +} + +template +void CacheKeyValueStoreImpl::SaveSnapshot(const std::string& name) 
{ + CudaCurrentDeviceGuard guard(device_index_); + std::lock_guard lock(mutex_); + SyncCacheToStore(); + store_->SaveSnapshot(name); +} + +template +void CacheKeyValueStoreImpl::SyncCacheToStore() { + if (synced_) { return; } + CudaCurrentDeviceGuard guard(device_index_); + auto device = + Singleton::Get()->GetDevice(DeviceType::kCUDA, device_index_); + CHECK(device); + auto* stream = device->CreateStream(); + auto* cuda_stream = stream->As(); + const uint64_t dump_capacity = cache_->DumpCapacity(); + CHECK_GT(max_query_length_, 0); + for (uint64_t start_key_index = 0; start_key_index < dump_capacity; + start_key_index += max_query_length_) { + cache_->Dump(stream, start_key_index, + std::min(start_key_index + max_query_length_, dump_capacity), num_buffer_, + keys_buffer_, values_buffer_); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_buffer_, num_buffer_, sizeof(uint32_t), + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(stream->Sync()); + if (*host_num_buffer_ == 0) { continue; } + store_->Put(stream, *host_num_buffer_, keys_buffer_, values_buffer_); + CHECK_JUST(stream->Sync()); + } + device->DestroyStream(stream); + synced_ = true; +} + +template +std::unique_ptr DispatchElemType(std::unique_ptr&& store, + std::unique_ptr&& cache) { + const uint32_t value_size = store->ValueSize(); + if (value_size % sizeof(uint4) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else if (value_size % sizeof(uint64_t) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else if (value_size % sizeof(uint32_t) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else if (value_size % sizeof(uint16_t) == 0) { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } else { + return std::unique_ptr( + new CacheKeyValueStoreImpl(std::move(store), std::move(cache))); + } +} + +std::unique_ptr DispatchKeyType(std::unique_ptr&& store, + std::unique_ptr&& cache) { + const uint32_t key_size = store->KeySize(); + if (key_size == 4) { + return DispatchElemType(std::move(store), std::move(cache)); + } else if (key_size == 8) { + return DispatchElemType(std::move(store), std::move(cache)); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace + +std::unique_ptr NewCachedKeyValueStore(std::unique_ptr&& store, + std::unique_ptr&& cache) { + return DispatchKeyType(std::move(store), std::move(cache)); +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/full_cache.hip.cpp b/oneflow/core/embedding/full_cache.hip.cpp index 164ecec..fed4182 100644 --- a/oneflow/core/embedding/full_cache.hip.cpp +++ b/oneflow/core/embedding/full_cache.hip.cpp @@ -1,640 +1,640 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/embedding/full_cache.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/embedding/hash_functions.hip.h" -#include "oneflow/core/hip/atomic.hip.h" - -namespace oneflow { - -namespace embedding { - -using Key32 = unsigned int; -using Key64 = unsigned long long int; -using Key128 = ulonglong2; - -namespace { - -template -__device__ bool TryGetOrInsert(Key* entry_key, volatile Index* entry_index, Index* table_size, - Key key, Index* out) { - Key key_hi = (key | 0x1); - Key key_lo = (key & 0x1); - Index index_plus_one = 0; - Key old_entry_key = cuda::atomic::CAS(entry_key, static_cast(0), key_hi); - while (index_plus_one == 0) { - if (old_entry_key == static_cast(0)) { - Index index = cuda::atomic::Add(table_size, static_cast(1)); - index_plus_one = index + 1; - *entry_index = ((index_plus_one << 1U) | key_lo); - *out = index_plus_one; - return true; - } else if (old_entry_key == key_hi) { - const Index entry_index_val = *entry_index; - if (entry_index_val == 0) { - // do nothing - } else if ((entry_index_val & 0x1) == key_lo) { - *out = (entry_index_val >> 1U); - return true; - } else { - return false; - } - } else { - return false; - } - } - return false; -} - -template -__device__ bool GetOrInsertOne(const size_t capacity, Key* table_keys, Index* table_indices, - Index* table_size, Key key, size_t hash, Index* out) { - const size_t start_idx = hash % capacity; - for (size_t count = 0; count < capacity; ++count) { - const size_t idx = (start_idx + count) % capacity; - Key* entry_key = table_keys + idx; - Index* entry_index = table_indices + idx; - if (TryGetOrInsert(entry_key, entry_index, table_size, key, out)) { return true; } - } - return false; -} - -template -__device__ bool GetOne(const size_t capacity, Key* table_keys, Index* table_indices, Key key, - size_t hash, Index* out) { - const size_t start_idx = hash % capacity; - for (size_t count = 0; count < capacity; ++count) { - const size_t idx = (start_idx + count) % capacity; - Key entry_key = table_keys[idx]; - Key entry_index = table_indices[idx]; - Key key_hi = (key | 0x1); - Key key_lo = (key & 0x1); - if (entry_key == 0) { break; } - if (entry_key == key_hi) { - if ((entry_index & 0x1) == key_lo) { - *out = (entry_index >> 1U); - return true; - } - } - } - *out = 0; - return false; -} - -template -__global__ void OrdinalEncodeKernel(uint64_t capacity, Key* table_keys, Index* table_indices, - Index* table_size, uint32_t num_keys, const Key* keys, - Index* context) { - CUDA_1D_KERNEL_LOOP(i, num_keys) { - Key key = keys[i]; - uint64_t hash = FullCacheHash()(key); - bool success = GetOrInsertOne(capacity, table_keys, table_indices, table_size, key, - hash, context + i); - assert(success); - } -} - -template -__global__ void OrdinalEncodeLookupKernel(uint64_t capacity, Key* table_keys, Index* table_indices, - uint32_t num_keys, const Key* keys, Index* context) { - CUDA_1D_KERNEL_LOOP(i, num_keys) { - Key key = keys[i]; - uint64_t hash = FullCacheHash()(key); - GetOne(capacity, table_keys, table_indices, key, hash, context + i); - } -} - -template -__global__ void OrdinalEncodeDumpKernel(const Key* table_keys, const Index* table_indices, - uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, Key* keys, Index* context) { - CUDA_1D_KERNEL_LOOP(i, (end_key_index - start_key_index)) { - Key entry_key = table_keys[i + start_key_index]; - Index entry_index = table_indices[i + start_key_index]; - if (entry_index != 0) { - uint32_t index = 
cuda::atomic::Add(n_dumped, static_cast(1)); - keys[index] = ((entry_key ^ 0x1) | (entry_index & 0x1)); - context[index] = (entry_index >> 1U); - } - } -} - -template -__global__ void LookupKernel(uint32_t value_length, const Elem* cache_values, - uint32_t values_elem_cnt, const Key* keys, const Index* context, - Elem* values, uint32_t* n_missing, Key* missing_keys, - uint32_t* missing_indices) { - CUDA_1D_KERNEL_LOOP(i, values_elem_cnt) { - const uint64_t key_id = i / value_length; - const uint64_t ctx = context[key_id]; - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * value_length; - if (ctx == 0) { - const Key missing_key = keys[key_id]; - if (col_id == 0) { - const uint32_t old_n_missing = cuda::atomic::Add(n_missing, static_cast(1)); - missing_keys[old_n_missing] = missing_key; - missing_indices[old_n_missing] = key_id; - } - continue; - } - if (return_value) { values[i] = cache_values[row_id * value_length + col_id]; } - } -} - -template -__global__ void EncodeLookupKernel(uint32_t value_length, const Elem* cache_values, - uint32_t values_elem_cnt, const Key* keys, const Index* context, - Elem* values, uint32_t* n_missing, Key* missing_keys, - uint32_t* missing_indices, const size_t capacity, - Key* table_keys, Index* table_indices) { - constexpr uint32_t warp_size = 32; - constexpr uint32_t n_warp_per_block = block_size / warp_size; - const uint32_t warp_id = threadIdx.x / warp_size; - const uint32_t lane_id = threadIdx.x % warp_size; - const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; - const uint32_t global_n_warp = gridDim.x * n_warp_per_block; - const uint32_t n_keys = values_elem_cnt / value_length; - __shared__ Key batch_keys[n_warp_per_block][warp_size]; - __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; - __shared__ Key batch_missing_keys[n_warp_per_block][warp_size]; - __shared__ uint32_t batch_missing_indices[n_warp_per_block][warp_size]; - __shared__ uint32_t batch_n_missing[n_warp_per_block]; - for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; - batch_start += global_n_warp * warp_size) { - const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); - if (lane_id == 0) { batch_n_missing[warp_id] = 0; } - __syncthreads(); - const uint32_t key_offset = batch_start + lane_id; - if (key_offset < n_keys) { - const Key key = keys[batch_start + lane_id]; - const uint64_t hash = FullCacheHash()(key); - Index row; - GetOne(capacity, table_keys, table_indices, key, hash, &row); - batch_row_ids[warp_id][lane_id] = row; - if (row == 0) { - const uint32_t batch_missing_idx = atomicAdd(batch_n_missing + warp_id, 1); - batch_missing_keys[warp_id][batch_missing_idx] = key; - batch_missing_indices[warp_id][batch_missing_idx] = key_offset; - } - } - __syncthreads(); - const uint32_t batch_n_missing_t = batch_n_missing[warp_id]; - if (lane_id == 0) { - const uint32_t old_n_missing = - cuda::atomic::Add(n_missing, static_cast(batch_n_missing_t)); - batch_n_missing[warp_id] = old_n_missing; - } - __syncthreads(); - if (lane_id < batch_n_missing_t) { - missing_keys[batch_n_missing[warp_id] + lane_id] = batch_missing_keys[warp_id][lane_id]; - missing_indices[batch_n_missing[warp_id] + lane_id] = batch_missing_indices[warp_id][lane_id]; - } - for (int i = 0; i < batch_n_key; ++i) { - const Key key = batch_keys[warp_id][i]; - const Index row = batch_row_ids[warp_id][i]; - if (row == 0) { continue; } - for (int col = lane_id; col < value_length; col += warp_size) { - values[(batch_start + i) * 
value_length + col] = - cache_values[(row - 1) * value_length + col]; - } - } - __syncthreads(); - } -} - -template -struct alignas(sizeof(T) * pack_size) Pack { - T elem[pack_size]; -}; - -template -__global__ void EncodeLookupMaskKernel(uint32_t value_length, const Elem* __restrict__ cache_values, - uint32_t values_elem_cnt, const Key* __restrict__ keys, - const Index* __restrict__ context, Elem* __restrict__ values, - uint8_t* __restrict__ mask, const size_t capacity, - Key* __restrict__ table_keys, - Index* __restrict__ table_indices) { - const uint32_t packed_cols = value_length / pack_size; - auto* packed_values = reinterpret_cast*>(values); - const auto* packed_cache_values = reinterpret_cast*>(cache_values); - constexpr uint32_t warp_size = 32; - constexpr uint32_t n_warp_per_block = block_size / warp_size; - const uint32_t warp_id = threadIdx.x / warp_size; - const uint32_t lane_id = threadIdx.x % warp_size; - const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; - const uint32_t global_n_warp = gridDim.x * n_warp_per_block; - const uint32_t n_keys = values_elem_cnt / value_length; - __shared__ Key batch_keys[n_warp_per_block][warp_size]; - __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; - for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; - batch_start += global_n_warp * warp_size) { - const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); - const uint32_t key_offset = batch_start + lane_id; - if (key_offset < n_keys) { - const Key key = keys[batch_start + lane_id]; - const uint64_t hash = FullCacheHash()(key); - Index row; - GetOne(capacity, table_keys, table_indices, key, hash, &row); - batch_row_ids[warp_id][lane_id] = row; - mask[key_offset] = row > 0; - } - __syncthreads(); - for (int i = 0; i < batch_n_key; ++i) { - const Key key = batch_keys[warp_id][i]; - const Index row = batch_row_ids[warp_id][i]; - if (row == 0) { continue; } -#pragma unroll 4 - for (int col = lane_id; col < packed_cols; col += warp_size) { - packed_values[(batch_start + i) * packed_cols + col] = - packed_cache_values[(row - 1) * packed_cols + col]; - } - } - __syncthreads(); - } -} - -template -__global__ void UpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, - const Index* context, const Elem* values) { - const int packed_values_elem_cnt = values_elem_cnt / pack_size; - const uint32_t packed_elem_cnt = value_length / pack_size; - auto* packed_cache_values = reinterpret_cast*>(cache_values); - auto* packed_values = reinterpret_cast*>(values); - CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { - const uint64_t key_id = i / packed_elem_cnt; - const uint64_t ctx = context[key_id]; - if (ctx == 0) { continue; } - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * packed_elem_cnt; - packed_cache_values[row_id * packed_elem_cnt + col_id] = packed_values[i]; - } -} - -template -__global__ typename std::enable_if::value, void>::type -FusedHalfUpdateKernel(uint32_t value_length, Elem* __restrict__ cache_values, - uint32_t values_elem_cnt, const Index* __restrict__ context, - const Elem* __restrict__ values, const half* __restrict__ update, - const float* __restrict__ lr, float scale) { - const int packed_values_elem_cnt = values_elem_cnt / pack_size; - const uint32_t packed_elem_cnt = value_length / pack_size; - auto* packed_cache_values = reinterpret_cast*>(cache_values); - auto* packed_values = reinterpret_cast*>(values); - auto* packed_update = reinterpret_cast*>(update); - const float 
alpha = -*lr * scale; - CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { - const uint64_t key_id = i / packed_elem_cnt; - const uint64_t ctx = context[key_id]; - if (ctx == 0) { continue; } - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * packed_elem_cnt; - Pack m = packed_values[i]; - Pack u = packed_update[i]; - for (size_t j = 0; j < pack_size; ++j) { m.elem[j] += static_cast(u.elem[j]) * alpha; } - packed_cache_values[row_id * packed_elem_cnt + col_id] = m; - } -} - -template -__global__ typename std::enable_if::value, void>::type -FusedHalfUpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, - const Index* context, const Elem* values, const half* update, const float* lr, - float scale) { - asm volatile("s_trap 0;"); -} - -template -__global__ void DumpValueKernel(uint32_t value_length, const uint32_t* n_dumped, - const Index* context, const Elem* cache_values, Elem* values) { - CUDA_1D_KERNEL_LOOP(i, *n_dumped * value_length) { - const uint64_t key_id = i / value_length; - const uint64_t ctx = context[key_id]; - const uint64_t row_id = ctx - 1; - const uint64_t col_id = i - key_id * value_length; - values[i] = cache_values[row_id * value_length + col_id]; - } -} - -template -class OrdinalEncoder { - public: - OF_DISALLOW_COPY_AND_MOVE(OrdinalEncoder); - explicit OrdinalEncoder(uint64_t capacity, float load_factor) - : capacity_(capacity), table_capacity_(capacity / load_factor) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - OF_CUDA_CHECK(hipMalloc(&table_size_, sizeof(Index))); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&table_size_host_), sizeof(Index))); - OF_CUDA_CHECK(hipMalloc(&table_keys_, table_capacity_ * sizeof(Key))); - OF_CUDA_CHECK(hipMalloc(&table_indices_, table_capacity_ * sizeof(Index))); - Clear(); - } - ~OrdinalEncoder() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipFree(table_size_)); - OF_CUDA_CHECK(hipHostFree(table_size_host_)); - OF_CUDA_CHECK(hipFree(table_keys_)); - OF_CUDA_CHECK(hipFree(table_indices_)); - } - - template - void Encode(ep::Stream* stream, uint32_t num_keys, const Key* keys, Index* context) { - if (insert) { - RUN_CUDA_KERNEL((OrdinalEncodeKernel), stream, num_keys, table_capacity_, - table_keys_, table_indices_, table_size_, num_keys, keys, context); - OF_CUDA_CHECK(hipMemcpyAsync(table_size_host_, table_size_, sizeof(Index), hipMemcpyDefault, - stream->As()->cuda_stream())); - CHECK_JUST(stream->Sync()); - CHECK_LT(*table_size_host_, capacity_) - << "The number of key is larger than cache size, please enlarge cache_memory_budget. 
"; - } else { - RUN_CUDA_KERNEL((OrdinalEncodeLookupKernel), stream, num_keys, table_capacity_, - table_keys_, table_indices_, num_keys, keys, context); - } - } - - void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, Key* keys, Index* context) { - OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - RUN_CUDA_KERNEL((OrdinalEncodeDumpKernel), stream, end_key_index - start_key_index, - table_keys_, table_indices_, start_key_index, end_key_index, n_dumped, keys, - context); - } - - void Clear() { - OF_CUDA_CHECK(hipMemset(table_size_, 0, sizeof(Index))); - OF_CUDA_CHECK(hipMemset(table_keys_, 0, table_capacity_ * sizeof(Key))); - OF_CUDA_CHECK(hipMemset(table_indices_, 0, table_capacity_ * sizeof(Index))); - } - - uint64_t TableCapacity() const { return table_capacity_; } - - Key* table_keys() const { return table_keys_; } - - Index* table_indices() const { return table_indices_; } - - private: - int device_index_{}; - Key* table_keys_; - Index* table_indices_; - uint64_t capacity_; - uint64_t table_capacity_; - Index* table_size_{}; - Index* table_size_host_{}; -}; - -template -class CacheImpl : public Cache { - public: - OF_DISALLOW_COPY_AND_MOVE(CacheImpl); - explicit CacheImpl(const CacheOptions& options) - : encoder_(options.capacity, options.load_factor), - device_index_(-1), - options_(options), - max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - const uint64_t values_size = options.capacity * options.value_size; - if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipMalloc(&values_, values_size)); - } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { - if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&values_), values_size)); - } else { - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&values_), - values_size)); - } - } else { - UNIMPLEMENTED(); - } - num_elem_per_value_ = options_.value_size / sizeof(Elem); - } - ~CacheImpl() { - CudaCurrentDeviceGuard guard(device_index_); - if (options_.value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipFree(values_)); - } else if (options_.value_memory_kind == CacheOptions::MemoryKind::kHost) { - OF_CUDA_CHECK(hipHostFree(values_)); - } else { - UNIMPLEMENTED(); - } - if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } - } - - uint64_t Capacity() const override { return options_.capacity; } - uint64_t DumpCapacity() const override { return encoder_.TableCapacity(); } - uint32_t KeySize() const override { return options_.key_size; } - - uint32_t ValueSize() const override { return options_.value_size; } - - DataType ValueType() const override { return options_.value_type; } - - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } - OF_CUDA_CHECK(hipMalloc(&encoding_buffer_, query_length * sizeof(uint64_t))); - max_query_length_ = query_length; - } - - CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kFull; } - - void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) 
override; - - void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) override; - - void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, - uint8_t* mask) override; - - void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override; - void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - const void* update, const float* lr, float scale, uint32_t* n_evicted, - void* evicted_keys, void* evicted_values) override; - void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, void* keys, void* values) override; - - void Clear() override; - - private: - OrdinalEncoder encoder_; - int device_index_; - uint32_t num_elem_per_value_{}; - Elem* values_; - Index* encoding_buffer_{}; - CacheOptions options_; - uint32_t max_query_length_; -}; - -template -void CacheImpl::Test(ep::Stream* stream, uint32_t n_keys, - const void* keys, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) { - OF_CUDA_CHECK( - hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - RUN_CUDA_KERNEL((LookupKernel), stream, values_elem_cnt, - num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), - encoding_buffer_, nullptr, n_missing, static_cast(missing_keys), - missing_indices); -} - -template -void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, - const void* keys, void* values, - uint32_t* n_missing, void* missing_keys, - uint32_t* missing_indices) { - OF_CUDA_CHECK( - hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - constexpr uint32_t block_size = 128; - uint32_t grid_size = (n_keys + block_size - 1) / block_size; - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - EncodeLookupKernel - <<As()->cuda_stream()>>>( - num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), - encoding_buffer_, static_cast(values), n_missing, static_cast(missing_keys), - missing_indices, encoder_.TableCapacity(), encoder_.table_keys(), - encoder_.table_indices()); -} - -template -void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, - const void* keys, void* values, uint8_t* mask) { - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - constexpr uint32_t block_size = 128; - uint32_t grid_size = (n_keys + block_size - 1) / block_size; - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - EncodeLookupMaskKernel - <<As()->cuda_stream()>>>( - num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), - encoding_buffer_, static_cast(values), mask, encoder_.TableCapacity(), - encoder_.table_keys(), encoder_.table_indices()); -} - -template -void CacheImpl::Put(ep::Stream* stream, uint32_t n_keys, - const void* keys, const void* values, - uint32_t* n_evicted, void* evicted_keys, - void* evicted_values) { - OF_CUDA_CHECK( - hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - encoder_.template Encode(stream, n_keys, static_cast(keys), 
encoding_buffer_); - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - RUN_CUDA_KERNEL((UpdateKernel), stream, values_elem_cnt / pack_size, - num_elem_per_value_, values_, values_elem_cnt, encoding_buffer_, - static_cast(values)); -} - -template -void CacheImpl::FusedHalfUpdatePut( - ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, const void* update, - const float* lr, float scale, uint32_t* n_evicted, void* evicted_keys, void* evicted_values) { - if (!std::is_same::value) { UNIMPLEMENTED(); } - OF_CUDA_CHECK( - hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); - if (n_keys == 0) { return; } - CHECK_LE(n_keys, max_query_length_); - encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); - const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; - RUN_CUDA_KERNEL((FusedHalfUpdateKernel), stream, - values_elem_cnt / pack_size, num_elem_per_value_, values_, values_elem_cnt, - encoding_buffer_, static_cast(values), - static_cast(update), lr, scale); -} -template -void CacheImpl::Dump(ep::Stream* stream, uint64_t start_key_index, - uint64_t end_key_index, uint32_t* n_dumped, - void* keys, void* values) { - encoder_.Dump(stream, start_key_index, end_key_index, n_dumped, static_cast(keys), - encoding_buffer_); - RUN_CUDA_KERNEL((DumpValueKernel), stream, - num_elem_per_value_ * (end_key_index - start_key_index), num_elem_per_value_, - n_dumped, encoding_buffer_, values_, static_cast(values)); -} - -template -void CacheImpl::Clear() { - encoder_.Clear(); -} - -template -std::unique_ptr DispatchValueType(const CacheOptions& options) { - if (options.value_type == DataType::kFloat) { - const size_t value_elem_cnt = options.value_size / sizeof(float); - const size_t half_warp = 16; - if (value_elem_cnt % 4 == 0 && value_elem_cnt / 4 > half_warp) { - return std::unique_ptr(new CacheImpl(options)); - } else if (value_elem_cnt % 2 == 0 && value_elem_cnt / 2 > half_warp) { - return std::unique_ptr(new CacheImpl(options)); - } else { - return std::unique_ptr(new CacheImpl(options)); - } - } else if (options.value_size % sizeof(ulonglong2) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else if (options.value_size % sizeof(uint64_t) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else if (options.value_size % sizeof(uint32_t) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else if (options.value_size % sizeof(uint16_t) == 0) { - return std::unique_ptr(new CacheImpl(options)); - } else { - return std::unique_ptr(new CacheImpl(options)); - } -} - -template -std::unique_ptr DispatchKeyType(const CacheOptions& options) { - if (options.key_size == sizeof(Key32)) { - return DispatchValueType(options); - } else if (options.key_size == sizeof(Key64)) { - return DispatchValueType(options); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -std::unique_ptr DispatchIndexType(const CacheOptions& options) { - const int64_t table_capacity = static_cast(options.capacity) / options.load_factor; - if (table_capacity >= (1ULL << 31ULL)) { - return DispatchKeyType(options); - } else { - return DispatchKeyType(options); - } -} - -} // namespace - -std::unique_ptr NewFullCache(const CacheOptions& options) { - return DispatchIndexType(options); -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/embedding/full_cache.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/embedding/hash_functions.hip.h" +#include "oneflow/core/hip/atomic.hip.h" + +namespace oneflow { + +namespace embedding { + +using Key32 = unsigned int; +using Key64 = unsigned long long int; +using Key128 = ulonglong2; + +namespace { + +template +__device__ bool TryGetOrInsert(Key* entry_key, volatile Index* entry_index, Index* table_size, + Key key, Index* out) { + Key key_hi = (key | 0x1); + Key key_lo = (key & 0x1); + Index index_plus_one = 0; + Key old_entry_key = cuda::atomic::CAS(entry_key, static_cast(0), key_hi); + while (index_plus_one == 0) { + if (old_entry_key == static_cast(0)) { + Index index = cuda::atomic::Add(table_size, static_cast(1)); + index_plus_one = index + 1; + *entry_index = ((index_plus_one << 1U) | key_lo); + *out = index_plus_one; + return true; + } else if (old_entry_key == key_hi) { + const Index entry_index_val = *entry_index; + if (entry_index_val == 0) { + // do nothing + } else if ((entry_index_val & 0x1) == key_lo) { + *out = (entry_index_val >> 1U); + return true; + } else { + return false; + } + } else { + return false; + } + } + return false; +} + +template +__device__ bool GetOrInsertOne(const size_t capacity, Key* table_keys, Index* table_indices, + Index* table_size, Key key, size_t hash, Index* out) { + const size_t start_idx = hash % capacity; + for (size_t count = 0; count < capacity; ++count) { + const size_t idx = (start_idx + count) % capacity; + Key* entry_key = table_keys + idx; + Index* entry_index = table_indices + idx; + if (TryGetOrInsert(entry_key, entry_index, table_size, key, out)) { return true; } + } + return false; +} + +template +__device__ bool GetOne(const size_t capacity, Key* table_keys, Index* table_indices, Key key, + size_t hash, Index* out) { + const size_t start_idx = hash % capacity; + for (size_t count = 0; count < capacity; ++count) { + const size_t idx = (start_idx + count) % capacity; + Key entry_key = table_keys[idx]; + Key entry_index = table_indices[idx]; + Key key_hi = (key | 0x1); + Key key_lo = (key & 0x1); + if (entry_key == 0) { break; } + if (entry_key == key_hi) { + if ((entry_index & 0x1) == key_lo) { + *out = (entry_index >> 1U); + return true; + } + } + } + *out = 0; + return false; +} + +template +__global__ void OrdinalEncodeKernel(uint64_t capacity, Key* table_keys, Index* table_indices, + Index* table_size, uint32_t num_keys, const Key* keys, + Index* context) { + CUDA_1D_KERNEL_LOOP(i, num_keys) { + Key key = keys[i]; + uint64_t hash = FullCacheHash()(key); + bool success = GetOrInsertOne(capacity, table_keys, table_indices, table_size, key, + hash, context + i); + assert(success); + } +} + +template +__global__ void OrdinalEncodeLookupKernel(uint64_t capacity, Key* table_keys, Index* table_indices, + uint32_t num_keys, const Key* keys, Index* context) { + CUDA_1D_KERNEL_LOOP(i, num_keys) { + Key key = keys[i]; + uint64_t hash = FullCacheHash()(key); + GetOne(capacity, table_keys, table_indices, key, hash, context + i); + } +} + +template +__global__ void 
OrdinalEncodeDumpKernel(const Key* table_keys, const Index* table_indices, + uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, Key* keys, Index* context) { + CUDA_1D_KERNEL_LOOP(i, (end_key_index - start_key_index)) { + Key entry_key = table_keys[i + start_key_index]; + Index entry_index = table_indices[i + start_key_index]; + if (entry_index != 0) { + uint32_t index = cuda::atomic::Add(n_dumped, static_cast(1)); + keys[index] = ((entry_key ^ 0x1) | (entry_index & 0x1)); + context[index] = (entry_index >> 1U); + } + } +} + +template +__global__ void LookupKernel(uint32_t value_length, const Elem* cache_values, + uint32_t values_elem_cnt, const Key* keys, const Index* context, + Elem* values, uint32_t* n_missing, Key* missing_keys, + uint32_t* missing_indices) { + CUDA_1D_KERNEL_LOOP(i, values_elem_cnt) { + const uint64_t key_id = i / value_length; + const uint64_t ctx = context[key_id]; + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * value_length; + if (ctx == 0) { + const Key missing_key = keys[key_id]; + if (col_id == 0) { + const uint32_t old_n_missing = cuda::atomic::Add(n_missing, static_cast(1)); + missing_keys[old_n_missing] = missing_key; + missing_indices[old_n_missing] = key_id; + } + continue; + } + if (return_value) { values[i] = cache_values[row_id * value_length + col_id]; } + } +} + +template +__global__ void EncodeLookupKernel(uint32_t value_length, const Elem* cache_values, + uint32_t values_elem_cnt, const Key* keys, const Index* context, + Elem* values, uint32_t* n_missing, Key* missing_keys, + uint32_t* missing_indices, const size_t capacity, + Key* table_keys, Index* table_indices) { + constexpr uint32_t warp_size = 32; + constexpr uint32_t n_warp_per_block = block_size / warp_size; + const uint32_t warp_id = threadIdx.x / warp_size; + const uint32_t lane_id = threadIdx.x % warp_size; + const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; + const uint32_t global_n_warp = gridDim.x * n_warp_per_block; + const uint32_t n_keys = values_elem_cnt / value_length; + __shared__ Key batch_keys[n_warp_per_block][warp_size]; + __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; + __shared__ Key batch_missing_keys[n_warp_per_block][warp_size]; + __shared__ uint32_t batch_missing_indices[n_warp_per_block][warp_size]; + __shared__ uint32_t batch_n_missing[n_warp_per_block]; + for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; + batch_start += global_n_warp * warp_size) { + const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); + if (lane_id == 0) { batch_n_missing[warp_id] = 0; } + __syncthreads(); + const uint32_t key_offset = batch_start + lane_id; + if (key_offset < n_keys) { + const Key key = keys[batch_start + lane_id]; + const uint64_t hash = FullCacheHash()(key); + Index row; + GetOne(capacity, table_keys, table_indices, key, hash, &row); + batch_row_ids[warp_id][lane_id] = row; + if (row == 0) { + const uint32_t batch_missing_idx = atomicAdd(batch_n_missing + warp_id, 1); + batch_missing_keys[warp_id][batch_missing_idx] = key; + batch_missing_indices[warp_id][batch_missing_idx] = key_offset; + } + } + __syncthreads(); + const uint32_t batch_n_missing_t = batch_n_missing[warp_id]; + if (lane_id == 0) { + const uint32_t old_n_missing = + cuda::atomic::Add(n_missing, static_cast(batch_n_missing_t)); + batch_n_missing[warp_id] = old_n_missing; + } + __syncthreads(); + if (lane_id < batch_n_missing_t) { + missing_keys[batch_n_missing[warp_id] + lane_id] = 
batch_missing_keys[warp_id][lane_id]; + missing_indices[batch_n_missing[warp_id] + lane_id] = batch_missing_indices[warp_id][lane_id]; + } + for (int i = 0; i < batch_n_key; ++i) { + const Key key = batch_keys[warp_id][i]; + const Index row = batch_row_ids[warp_id][i]; + if (row == 0) { continue; } + for (int col = lane_id; col < value_length; col += warp_size) { + values[(batch_start + i) * value_length + col] = + cache_values[(row - 1) * value_length + col]; + } + } + __syncthreads(); + } +} + +template +struct alignas(sizeof(T) * pack_size) Pack { + T elem[pack_size]; +}; + +template +__global__ void EncodeLookupMaskKernel(uint32_t value_length, const Elem* __restrict__ cache_values, + uint32_t values_elem_cnt, const Key* __restrict__ keys, + const Index* __restrict__ context, Elem* __restrict__ values, + uint8_t* __restrict__ mask, const size_t capacity, + Key* __restrict__ table_keys, + Index* __restrict__ table_indices) { + const uint32_t packed_cols = value_length / pack_size; + auto* packed_values = reinterpret_cast*>(values); + const auto* packed_cache_values = reinterpret_cast*>(cache_values); + constexpr uint32_t warp_size = 32; + constexpr uint32_t n_warp_per_block = block_size / warp_size; + const uint32_t warp_id = threadIdx.x / warp_size; + const uint32_t lane_id = threadIdx.x % warp_size; + const uint32_t global_warp_id = blockIdx.x * n_warp_per_block + warp_id; + const uint32_t global_n_warp = gridDim.x * n_warp_per_block; + const uint32_t n_keys = values_elem_cnt / value_length; + __shared__ Key batch_keys[n_warp_per_block][warp_size]; + __shared__ Index batch_row_ids[n_warp_per_block][warp_size]; + for (uint32_t batch_start = global_warp_id * warp_size; batch_start < n_keys; + batch_start += global_n_warp * warp_size) { + const uint32_t batch_n_key = min(n_keys - batch_start, warp_size); + const uint32_t key_offset = batch_start + lane_id; + if (key_offset < n_keys) { + const Key key = keys[batch_start + lane_id]; + const uint64_t hash = FullCacheHash()(key); + Index row; + GetOne(capacity, table_keys, table_indices, key, hash, &row); + batch_row_ids[warp_id][lane_id] = row; + mask[key_offset] = row > 0; + } + __syncthreads(); + for (int i = 0; i < batch_n_key; ++i) { + const Key key = batch_keys[warp_id][i]; + const Index row = batch_row_ids[warp_id][i]; + if (row == 0) { continue; } +#pragma unroll 4 + for (int col = lane_id; col < packed_cols; col += warp_size) { + packed_values[(batch_start + i) * packed_cols + col] = + packed_cache_values[(row - 1) * packed_cols + col]; + } + } + __syncthreads(); + } +} + +template +__global__ void UpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, + const Index* context, const Elem* values) { + const int packed_values_elem_cnt = values_elem_cnt / pack_size; + const uint32_t packed_elem_cnt = value_length / pack_size; + auto* packed_cache_values = reinterpret_cast*>(cache_values); + auto* packed_values = reinterpret_cast*>(values); + CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { + const uint64_t key_id = i / packed_elem_cnt; + const uint64_t ctx = context[key_id]; + if (ctx == 0) { continue; } + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * packed_elem_cnt; + packed_cache_values[row_id * packed_elem_cnt + col_id] = packed_values[i]; + } +} + +template +__global__ typename std::enable_if::value, void>::type +FusedHalfUpdateKernel(uint32_t value_length, Elem* __restrict__ cache_values, + uint32_t values_elem_cnt, const Index* __restrict__ context, + const Elem* __restrict__ 
values, const half* __restrict__ update, + const float* __restrict__ lr, float scale) { + const int packed_values_elem_cnt = values_elem_cnt / pack_size; + const uint32_t packed_elem_cnt = value_length / pack_size; + auto* packed_cache_values = reinterpret_cast*>(cache_values); + auto* packed_values = reinterpret_cast*>(values); + auto* packed_update = reinterpret_cast*>(update); + const float alpha = -*lr * scale; + CUDA_1D_KERNEL_LOOP(i, packed_values_elem_cnt) { + const uint64_t key_id = i / packed_elem_cnt; + const uint64_t ctx = context[key_id]; + if (ctx == 0) { continue; } + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * packed_elem_cnt; + Pack m = packed_values[i]; + Pack u = packed_update[i]; + for (size_t j = 0; j < pack_size; ++j) { m.elem[j] += static_cast(u.elem[j]) * alpha; } + packed_cache_values[row_id * packed_elem_cnt + col_id] = m; + } +} + +template +__global__ typename std::enable_if::value, void>::type +FusedHalfUpdateKernel(uint32_t value_length, Elem* cache_values, uint32_t values_elem_cnt, + const Index* context, const Elem* values, const half* update, const float* lr, + float scale) { + asm volatile("s_trap 0;"); +} + +template +__global__ void DumpValueKernel(uint32_t value_length, const uint32_t* n_dumped, + const Index* context, const Elem* cache_values, Elem* values) { + CUDA_1D_KERNEL_LOOP(i, *n_dumped * value_length) { + const uint64_t key_id = i / value_length; + const uint64_t ctx = context[key_id]; + const uint64_t row_id = ctx - 1; + const uint64_t col_id = i - key_id * value_length; + values[i] = cache_values[row_id * value_length + col_id]; + } +} + +template +class OrdinalEncoder { + public: + OF_DISALLOW_COPY_AND_MOVE(OrdinalEncoder); + explicit OrdinalEncoder(uint64_t capacity, float load_factor) + : capacity_(capacity), table_capacity_(capacity / load_factor) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + OF_CUDA_CHECK(hipMalloc(&table_size_, sizeof(Index))); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&table_size_host_), sizeof(Index))); + OF_CUDA_CHECK(hipMalloc(&table_keys_, table_capacity_ * sizeof(Key))); + OF_CUDA_CHECK(hipMalloc(&table_indices_, table_capacity_ * sizeof(Index))); + Clear(); + } + ~OrdinalEncoder() { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipFree(table_size_)); + OF_CUDA_CHECK(hipHostFree(table_size_host_)); + OF_CUDA_CHECK(hipFree(table_keys_)); + OF_CUDA_CHECK(hipFree(table_indices_)); + } + + template + void Encode(ep::Stream* stream, uint32_t num_keys, const Key* keys, Index* context) { + if (insert) { + RUN_CUDA_KERNEL((OrdinalEncodeKernel), stream, num_keys, table_capacity_, + table_keys_, table_indices_, table_size_, num_keys, keys, context); + OF_CUDA_CHECK(hipMemcpyAsync(table_size_host_, table_size_, sizeof(Index), hipMemcpyDefault, + stream->As()->cuda_stream())); + CHECK_JUST(stream->Sync()); + CHECK_LT(*table_size_host_, capacity_) + << "The number of key is larger than cache size, please enlarge cache_memory_budget. 
"; + } else { + RUN_CUDA_KERNEL((OrdinalEncodeLookupKernel), stream, num_keys, table_capacity_, + table_keys_, table_indices_, num_keys, keys, context); + } + } + + void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, Key* keys, Index* context) { + OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + RUN_CUDA_KERNEL((OrdinalEncodeDumpKernel), stream, end_key_index - start_key_index, + table_keys_, table_indices_, start_key_index, end_key_index, n_dumped, keys, + context); + } + + void Clear() { + OF_CUDA_CHECK(hipMemset(table_size_, 0, sizeof(Index))); + OF_CUDA_CHECK(hipMemset(table_keys_, 0, table_capacity_ * sizeof(Key))); + OF_CUDA_CHECK(hipMemset(table_indices_, 0, table_capacity_ * sizeof(Index))); + } + + uint64_t TableCapacity() const { return table_capacity_; } + + Key* table_keys() const { return table_keys_; } + + Index* table_indices() const { return table_indices_; } + + private: + int device_index_{}; + Key* table_keys_; + Index* table_indices_; + uint64_t capacity_; + uint64_t table_capacity_; + Index* table_size_{}; + Index* table_size_host_{}; +}; + +template +class CacheImpl : public Cache { + public: + OF_DISALLOW_COPY_AND_MOVE(CacheImpl); + explicit CacheImpl(const CacheOptions& options) + : encoder_(options.capacity, options.load_factor), + device_index_(-1), + options_(options), + max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + const uint64_t values_size = options.capacity * options.value_size; + if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipMalloc(&values_, values_size)); + } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { + if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&values_), values_size)); + } else { + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&values_), + values_size)); + } + } else { + UNIMPLEMENTED(); + } + num_elem_per_value_ = options_.value_size / sizeof(Elem); + } + ~CacheImpl() { + CudaCurrentDeviceGuard guard(device_index_); + if (options_.value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipFree(values_)); + } else if (options_.value_memory_kind == CacheOptions::MemoryKind::kHost) { + OF_CUDA_CHECK(hipHostFree(values_)); + } else { + UNIMPLEMENTED(); + } + if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } + } + + uint64_t Capacity() const override { return options_.capacity; } + uint64_t DumpCapacity() const override { return encoder_.TableCapacity(); } + uint32_t KeySize() const override { return options_.key_size; } + + uint32_t ValueSize() const override { return options_.value_size; } + + DataType ValueType() const override { return options_.value_type; } + + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { return; } + if (max_query_length_ > 0) { OF_CUDA_CHECK(hipFree(encoding_buffer_)); } + OF_CUDA_CHECK(hipMalloc(&encoding_buffer_, query_length * sizeof(uint64_t))); + max_query_length_ = query_length; + } + + CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kFull; } + + void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) 
override; + + void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) override; + + void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, + uint8_t* mask) override; + + void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override; + void FusedHalfUpdatePut(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + const void* update, const float* lr, float scale, uint32_t* n_evicted, + void* evicted_keys, void* evicted_values) override; + void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, void* keys, void* values) override; + + void Clear() override; + + private: + OrdinalEncoder encoder_; + int device_index_; + uint32_t num_elem_per_value_{}; + Elem* values_; + Index* encoding_buffer_{}; + CacheOptions options_; + uint32_t max_query_length_; +}; + +template +void CacheImpl::Test(ep::Stream* stream, uint32_t n_keys, + const void* keys, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) { + OF_CUDA_CHECK( + hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + RUN_CUDA_KERNEL((LookupKernel), stream, values_elem_cnt, + num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), + encoding_buffer_, nullptr, n_missing, static_cast(missing_keys), + missing_indices); +} + +template +void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, + const void* keys, void* values, + uint32_t* n_missing, void* missing_keys, + uint32_t* missing_indices) { + OF_CUDA_CHECK( + hipMemsetAsync(n_missing, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + constexpr uint32_t block_size = 128; + uint32_t grid_size = (n_keys + block_size - 1) / block_size; + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + EncodeLookupKernel + <<As()->cuda_stream()>>>( + num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), + encoding_buffer_, static_cast(values), n_missing, static_cast(missing_keys), + missing_indices, encoder_.TableCapacity(), encoder_.table_keys(), + encoder_.table_indices()); +} + +template +void CacheImpl::Get(ep::Stream* stream, uint32_t n_keys, + const void* keys, void* values, uint8_t* mask) { + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + constexpr uint32_t block_size = 128; + uint32_t grid_size = (n_keys + block_size - 1) / block_size; + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + EncodeLookupMaskKernel + <<As()->cuda_stream()>>>( + num_elem_per_value_, values_, values_elem_cnt, static_cast(keys), + encoding_buffer_, static_cast(values), mask, encoder_.TableCapacity(), + encoder_.table_keys(), encoder_.table_indices()); +} + +template +void CacheImpl::Put(ep::Stream* stream, uint32_t n_keys, + const void* keys, const void* values, + uint32_t* n_evicted, void* evicted_keys, + void* evicted_values) { + OF_CUDA_CHECK( + hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + encoder_.template Encode(stream, n_keys, static_cast(keys), 
encoding_buffer_); + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + RUN_CUDA_KERNEL((UpdateKernel), stream, values_elem_cnt / pack_size, + num_elem_per_value_, values_, values_elem_cnt, encoding_buffer_, + static_cast(values)); +} + +template +void CacheImpl::FusedHalfUpdatePut( + ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, const void* update, + const float* lr, float scale, uint32_t* n_evicted, void* evicted_keys, void* evicted_values) { + if (!std::is_same::value) { UNIMPLEMENTED(); } + OF_CUDA_CHECK( + hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), stream->As()->cuda_stream())); + if (n_keys == 0) { return; } + CHECK_LE(n_keys, max_query_length_); + encoder_.template Encode(stream, n_keys, static_cast(keys), encoding_buffer_); + const uint32_t values_elem_cnt = n_keys * num_elem_per_value_; + RUN_CUDA_KERNEL((FusedHalfUpdateKernel), stream, + values_elem_cnt / pack_size, num_elem_per_value_, values_, values_elem_cnt, + encoding_buffer_, static_cast(values), + static_cast(update), lr, scale); +} +template +void CacheImpl::Dump(ep::Stream* stream, uint64_t start_key_index, + uint64_t end_key_index, uint32_t* n_dumped, + void* keys, void* values) { + encoder_.Dump(stream, start_key_index, end_key_index, n_dumped, static_cast(keys), + encoding_buffer_); + RUN_CUDA_KERNEL((DumpValueKernel), stream, + num_elem_per_value_ * (end_key_index - start_key_index), num_elem_per_value_, + n_dumped, encoding_buffer_, values_, static_cast(values)); +} + +template +void CacheImpl::Clear() { + encoder_.Clear(); +} + +template +std::unique_ptr DispatchValueType(const CacheOptions& options) { + if (options.value_type == DataType::kFloat) { + const size_t value_elem_cnt = options.value_size / sizeof(float); + const size_t half_warp = 16; + if (value_elem_cnt % 4 == 0 && value_elem_cnt / 4 > half_warp) { + return std::unique_ptr(new CacheImpl(options)); + } else if (value_elem_cnt % 2 == 0 && value_elem_cnt / 2 > half_warp) { + return std::unique_ptr(new CacheImpl(options)); + } else { + return std::unique_ptr(new CacheImpl(options)); + } + } else if (options.value_size % sizeof(ulonglong2) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else if (options.value_size % sizeof(uint64_t) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else if (options.value_size % sizeof(uint32_t) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else if (options.value_size % sizeof(uint16_t) == 0) { + return std::unique_ptr(new CacheImpl(options)); + } else { + return std::unique_ptr(new CacheImpl(options)); + } +} + +template +std::unique_ptr DispatchKeyType(const CacheOptions& options) { + if (options.key_size == sizeof(Key32)) { + return DispatchValueType(options); + } else if (options.key_size == sizeof(Key64)) { + return DispatchValueType(options); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +std::unique_ptr DispatchIndexType(const CacheOptions& options) { + const int64_t table_capacity = static_cast(options.capacity) / options.load_factor; + if (table_capacity >= (1ULL << 31ULL)) { + return DispatchKeyType(options); + } else { + return DispatchKeyType(options); + } +} + +} // namespace + +std::unique_ptr NewFullCache(const CacheOptions& options) { + return DispatchIndexType(options); +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/hash_functions.hip.h b/oneflow/core/embedding/hash_functions.hip.h index 25c6eb0..99a2373 100644 --- 
a/oneflow/core/embedding/hash_functions.hip.h +++ b/oneflow/core/embedding/hash_functions.hip.h @@ -1,100 +1,100 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ -#define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ - -#include -#include "oneflow/core/common/data_type.h" - -namespace oneflow { - -namespace embedding { - -namespace { - -// From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h -static const uint64_t PRIME64_1 = - 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 -static const uint64_t PRIME64_2 = - 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 -static const uint64_t PRIME64_3 = - 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 -static const uint64_t PRIME64_4 = - 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 -static const uint64_t PRIME64_5 = - 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 - -#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) - -OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) { - acc += input * PRIME64_2; - acc = XXH_rotl64(acc, 31); - acc *= PRIME64_1; - return acc; -} - -OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { - uint64_t acc = seed + PRIME64_5; - acc += sizeof(uint64_t); - acc = acc ^ XXH64_round(0, v); - acc = XXH_rotl64(acc, 27) * PRIME64_1; - acc = acc + PRIME64_4; - acc ^= (acc >> 33); - acc = acc * PRIME64_2; - acc = acc ^ (acc >> 29); - acc = acc * PRIME64_3; - acc = acc ^ (acc >> 32); - return acc; -} - -static const size_t kShardingHashSeed = 1; -static const size_t kLocalUniqueHashSeed = 2; -static const size_t kGlobalUniqueHashSeed = 3; -static const size_t kFullCacheHashSeed = 4; -static const size_t kLruCacheHashSeed = 5; - -} // namespace - -struct ShardingHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); } - OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); } - OF_DEVICE_FUNC size_t operator()(int32_t v) { - return xxh64_uint64(static_cast(v), kShardingHashSeed); - } - OF_DEVICE_FUNC size_t operator()(int64_t v) { - return xxh64_uint64(static_cast(v), kShardingHashSeed); - } -}; - -struct LocalUniqueHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); } -}; - -struct GlobalUniqueHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); } -}; - -struct FullCacheHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); } -}; - -struct LruCacheHash { - OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); } -}; - -} // namespace embedding -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ +#define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ + +#include +#include "oneflow/core/common/data_type.h" + +namespace oneflow { + +namespace embedding { + +namespace { + +// From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h +static const uint64_t PRIME64_1 = + 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 +static const uint64_t PRIME64_2 = + 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 +static const uint64_t PRIME64_3 = + 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 +static const uint64_t PRIME64_4 = + 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 +static const uint64_t PRIME64_5 = + 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 + +#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) + +OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) { + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { + uint64_t acc = seed + PRIME64_5; + acc += sizeof(uint64_t); + acc = acc ^ XXH64_round(0, v); + acc = XXH_rotl64(acc, 27) * PRIME64_1; + acc = acc + PRIME64_4; + acc ^= (acc >> 33); + acc = acc * PRIME64_2; + acc = acc ^ (acc >> 29); + acc = acc * PRIME64_3; + acc = acc ^ (acc >> 32); + return acc; +} + +static const size_t kShardingHashSeed = 1; +static const size_t kLocalUniqueHashSeed = 2; +static const size_t kGlobalUniqueHashSeed = 3; +static const size_t kFullCacheHashSeed = 4; +static const size_t kLruCacheHashSeed = 5; + +} // namespace + +struct ShardingHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); } + OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); } + OF_DEVICE_FUNC size_t operator()(int32_t v) { + return xxh64_uint64(static_cast(v), kShardingHashSeed); + } + OF_DEVICE_FUNC size_t operator()(int64_t v) { + return xxh64_uint64(static_cast(v), kShardingHashSeed); + } +}; + +struct LocalUniqueHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); } +}; + +struct GlobalUniqueHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); } +}; + +struct FullCacheHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); } +}; + +struct LruCacheHash { + OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); } +}; + +} // namespace embedding +} // namespace oneflow #endif // ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ \ No newline at end of file diff --git a/oneflow/core/embedding/lru_cache.hip.cpp b/oneflow/core/embedding/lru_cache.hip.cpp index 8db00c5..cfb1044 100644 
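
Editor's note, not part of the patch: a minimal single-threaded host sketch of the ordinal-encoding scheme that the full_cache kernels above implement with atomics (TryGetOrInsert / GetOrInsertOne / GetOne). It illustrates why the kernels store `key | 0x1` in the key slot (so 0 can mark an empty entry), stash the key's real low bit in bit 0 of the stored 1-based index, probe linearly from the hash, and return 0 for "missing" so callers can use `row_id = ctx - 1`. The names `OrdinalTable` and `Mix64` are invented for illustration; the hash follows the xxh64_uint64 steps and the kFullCacheHashSeed value (4) shown in the hash_functions.hip.h hunk. The device code does the same thing concurrently with atomicCAS/atomicAdd; this sketch only models the slot layout and probing.

#include <cstdint>
#include <iostream>
#include <vector>

// xxHash64-style mix of a single 64-bit key (same steps as xxh64_uint64 above).
static uint64_t Mix64(uint64_t v, uint64_t seed) {
  constexpr uint64_t P1 = 0x9E3779B185EBCA87ULL, P2 = 0xC2B2AE3D27D4EB4FULL,
                     P3 = 0x165667B19E3779F9ULL, P4 = 0x85EBCA77C2B2AE63ULL,
                     P5 = 0x27D4EB2F165667C5ULL;
  auto rotl = [](uint64_t x, int r) { return (x << r) | (x >> (64 - r)); };
  uint64_t acc = seed + P5 + sizeof(uint64_t);
  acc ^= rotl(v * P2, 31) * P1;  // XXH64_round(0, v)
  acc = rotl(acc, 27) * P1 + P4;
  acc ^= acc >> 33; acc *= P2;
  acc ^= acc >> 29; acc *= P3;
  acc ^= acc >> 32;
  return acc;
}

// Single-threaded model of the ordinal encoder: maps each distinct key to a
// dense 1-based index; 0 means "not present".
class OrdinalTable {
 public:
  explicit OrdinalTable(size_t capacity)
      : keys_(capacity, 0), indices_(capacity, 0), size_(0) {}

  // Insert-or-lookup, mirroring TryGetOrInsert / GetOrInsertOne.
  uint64_t GetOrInsert(uint64_t key) {
    const uint64_t key_hi = key | 0x1;  // stored key, never 0
    const uint64_t key_lo = key & 0x1;  // real low bit, kept in the index slot
    const size_t start = Mix64(key, /*kFullCacheHashSeed=*/4) % keys_.size();
    for (size_t probe = 0; probe < keys_.size(); ++probe) {
      const size_t slot = (start + probe) % keys_.size();
      if (keys_[slot] == 0) {  // empty slot: claim it (a CAS on the device)
        keys_[slot] = key_hi;
        indices_[slot] = (++size_ << 1) | key_lo;
        return indices_[slot] >> 1;
      }
      if (keys_[slot] == key_hi && (indices_[slot] & 0x1) == key_lo) {
        return indices_[slot] >> 1;  // already present
      }
      // different key (or same key_hi with the other low bit): keep probing
    }
    return 0;  // table full
  }

  // Lookup-only, mirroring GetOne: returns 0 if the key was never inserted.
  uint64_t Get(uint64_t key) const {
    const uint64_t key_hi = key | 0x1;
    const uint64_t key_lo = key & 0x1;
    const size_t start = Mix64(key, 4) % keys_.size();
    for (size_t probe = 0; probe < keys_.size(); ++probe) {
      const size_t slot = (start + probe) % keys_.size();
      if (keys_[slot] == 0) { break; }  // hit an empty slot: key is absent
      if (keys_[slot] == key_hi && (indices_[slot] & 0x1) == key_lo) {
        return indices_[slot] >> 1;
      }
    }
    return 0;
  }

 private:
  std::vector<uint64_t> keys_;
  std::vector<uint64_t> indices_;
  uint64_t size_;
};

int main() {
  OrdinalTable table(16);
  for (uint64_t k : {42u, 7u, 42u, 8u}) {
    std::cout << "key " << k << " -> index " << table.GetOrInsert(k) << "\n";
  }
  std::cout << "lookup 7 -> " << table.Get(7)
            << ", lookup 9 -> " << table.Get(9) << "\n";
  return 0;  // expected: 42->1, 7->2, 42->1, 8->3; lookup 7->2, lookup 9->0
}

The 1-based index is the same quantity the cache kernels call `context`: a value of 0 propagates as "missing", and every consumer subtracts one to get the cache row, which is why the dump and lookup kernels above test `ctx == 0` before touching `cache_values`.
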
--- a/oneflow/core/embedding/lru_cache.hip.cpp +++ b/oneflow/core/embedding/lru_cache.hip.cpp @@ -1,585 +1,585 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Inspired by https://github.com/NVIDIA-Merlin/HugeCTR/blob/master/gpu_cache/src/nv_gpu_cache.cu - -#include "oneflow/core/embedding/lru_cache.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/embedding/hash_functions.hip.h" -#include -#include - -namespace oneflow { - -namespace embedding { - -namespace { - -constexpr int kWarpSize = 64; -constexpr int kNumWarpPerBlock = 2; -constexpr int kBlockSize = kNumWarpPerBlock * kWarpSize; -constexpr unsigned long long int kFullMask = 0xFFFFFFFFFFFFFFFFU; - -ep::CudaLaunchConfig GetLaunchConfig(uint32_t n_keys) { - return ep::CudaLaunchConfig((n_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, - kWarpSize * kNumWarpPerBlock, 0); -} - -struct ThreadContext { - __device__ ThreadContext() { - const uint32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - global_warp_id = global_thread_id / kWarpSize; - warp_id_in_block = global_warp_id % kNumWarpPerBlock; // NOLINT - num_warps = gridDim.x * kNumWarpPerBlock; // NOLINT - lane_id = global_thread_id % kWarpSize; - } - - uint32_t global_warp_id; - uint32_t warp_id_in_block; - uint32_t num_warps; - uint32_t lane_id; -}; - -class WarpMutexAtomicImpl { - public: - OF_DISALLOW_COPY_AND_MOVE(WarpMutexAtomicImpl); - __device__ WarpMutexAtomicImpl() : flag_(0) {} - __device__ ~WarpMutexAtomicImpl() = default; - - __device__ void Lock(const ThreadContext& thread_ctx) { - if (thread_ctx.lane_id == 0) { - while (atomicCAS(&flag_, 0, 1) != 0) - ; - } - __threadfence(); - __syncthreads(); - } - - __device__ void Unlock(const ThreadContext& thread_ctx) { - __syncthreads(); - __threadfence(); - if (thread_ctx.lane_id == 0) { atomicExch(&flag_, 0); } - } - - private: - int32_t flag_; -}; - -template -struct LruCacheContext { - Key* keys; - Elem* lines; - uint8_t* ages; - void* mutex; - uint64_t n_set; - uint32_t line_size; - CacheOptions::MemoryKind value_memory_kind; -}; - -__global__ void InitCacheSetMutex(uint32_t n_set, void* mutex) { - - using WarpMutex = WarpMutexAtomicImpl; - - const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n_set) { new (reinterpret_cast(mutex) + idx) WarpMutex; } -} - -template -void ClearLruCacheContext(LruCacheContext* ctx) { - OF_CUDA_CHECK(hipMemset(ctx->keys, 0, ctx->n_set * kWarpSize * sizeof(Key))); - OF_CUDA_CHECK(hipMemset(ctx->ages, 0, ctx->n_set * kWarpSize * sizeof(uint8_t))); - InitCacheSetMutex<<<(ctx->n_set - 1 + 256) / 256, 256>>>(ctx->n_set, ctx->mutex); -} - -template -void InitLruCacheContext(const CacheOptions& options, LruCacheContext* ctx) { - const size_t keys_size_per_set = kWarpSize * sizeof(Key); - const uint32_t line_size = options.value_size / sizeof(Elem); - const size_t lines_size_per_set = kWarpSize * line_size * sizeof(Elem); - const size_t ages_size_per_set = kWarpSize * sizeof(uint8_t); - int device 
= 0; - OF_CUDA_CHECK(hipGetDevice(&device)); - int major = 0; - OF_CUDA_CHECK(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, device)); - size_t mutex_size_per_set = 0; - - mutex_size_per_set = sizeof(WarpMutexAtomicImpl); - - const size_t n_set = (options.capacity - 1 + kWarpSize) / kWarpSize; - CHECK_GT(n_set, 0); - ctx->n_set = n_set; - ctx->line_size = line_size; - const size_t keys_size = n_set * keys_size_per_set; - OF_CUDA_CHECK(hipMalloc(&(ctx->keys), keys_size)); - const size_t lines_size = n_set * lines_size_per_set; - if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipMalloc(&(ctx->lines), lines_size)); - } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { - if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&(ctx->lines)), lines_size)); - } else { - OF_CUDA_CHECK( - NumaAwareCudaMallocHost(device, reinterpret_cast(&ctx->lines), lines_size)); - } - } else { - UNIMPLEMENTED(); - } - ctx->value_memory_kind = options.value_memory_kind; - const size_t ages_size = n_set * ages_size_per_set; - OF_CUDA_CHECK(hipMalloc(&(ctx->ages), ages_size)); - const size_t mutex_size = n_set * mutex_size_per_set; - OF_CUDA_CHECK(hipMalloc(&(ctx->mutex), mutex_size)); - - ClearLruCacheContext(ctx); -} - -template -void DestroyLruCacheContext(LruCacheContext* ctx) { - OF_CUDA_CHECK(hipFree(ctx->keys)); - if (ctx->value_memory_kind == CacheOptions::MemoryKind::kDevice) { - OF_CUDA_CHECK(hipFree(ctx->lines)); - } else if (ctx->value_memory_kind == CacheOptions::MemoryKind::kHost) { - OF_CUDA_CHECK(hipHostFree(ctx->lines)); - } else { - UNIMPLEMENTED(); - } - OF_CUDA_CHECK(hipFree(ctx->ages)); - OF_CUDA_CHECK(hipFree(ctx->mutex)); -} - -template -struct SetContext { - - using WarpMutex = WarpMutexAtomicImpl; - - __device__ SetContext(const LruCacheContext& ctx, uint32_t set_id) - : keys(ctx.keys + set_id * kWarpSize), - mutex(reinterpret_cast(ctx.mutex) + set_id), - ages(ctx.ages + set_id * kWarpSize), - lines(ctx.lines + set_id * kWarpSize * ctx.line_size) {} - - __device__ int Lookup(const ThreadContext& thread_ctx, Key key) { - const Key lane_key = keys[thread_ctx.lane_id]; - const int lane_age = ages[thread_ctx.lane_id]; - const bool lane_hit = (lane_key == key && lane_age != 0); - const unsigned long long int hit_mask = __ballot(lane_hit); - if (hit_mask != 0) { - return __ffs(static_cast(hit_mask)) - 1; - } else { - return -1; - } - } - - __device__ void Read(const LruCacheContext& cache_ctx, const ThreadContext& thread_ctx, - int way, Elem* line) { - const Elem* from_line = lines + way * cache_ctx.line_size; - for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { - line[i] = from_line[i]; - } - } - - __device__ int InsertWithoutEvicting(const LruCacheContext& cache_ctx, - const ThreadContext& thread_ctx, Key key) { - int insert_way = -1; - const Key lane_key = keys[thread_ctx.lane_id]; - int lane_age = ages[thread_ctx.lane_id]; - const unsigned long long int hit_mask = __ballot(lane_key == key && lane_age != 0); - if (hit_mask != 0) { - insert_way = __ffs(static_cast(hit_mask)) - 1; - const int insert_way_age = __shfl(lane_age, insert_way); - if (lane_age > insert_way_age) { - lane_age -= 1; - } else if (thread_ctx.lane_id == insert_way) { - lane_age = kWarpSize; - } - __syncthreads(); - } - if (insert_way == -1) { - const unsigned long long int valid_mask = __ballot(lane_age != 0); - if (valid_mask != 
kFullMask) { - insert_way = __popc(static_cast(valid_mask)); - if (lane_age > 0) { - lane_age -= 1; - } else if (thread_ctx.lane_id == insert_way) { - lane_age = kWarpSize; - keys[insert_way] = key; - } - __syncthreads(); - } - } - if (insert_way != -1) { ages[thread_ctx.lane_id] = lane_age; } - return insert_way; - } - - __device__ void Evict(const LruCacheContext& cache_ctx, - const ThreadContext& thread_ctx, Key key, int* way, Key* evicted_key) { - const Key lane_key = keys[thread_ctx.lane_id]; - int lane_age = ages[thread_ctx.lane_id]; - const int insert_way = __ffs(static_cast(__ballot(lane_age == 1))) - 1; - *evicted_key = __shfl(lane_key, insert_way); - if (thread_ctx.lane_id == insert_way) { - keys[insert_way] = key; - lane_age = kWarpSize; - } else if (lane_age > 1) { - lane_age -= 1; - } - __syncthreads(); - ages[thread_ctx.lane_id] = lane_age; - *way = insert_way; - } - - __device__ void Write(const LruCacheContext& cache_ctx, - const ThreadContext& thread_ctx, int way, const Elem* line) { - Elem* to_line = lines + way * cache_ctx.line_size; - for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { - to_line[i] = line[i]; - } - } - - __device__ void Lock(const ThreadContext& thread_ctx) { mutex->Lock(thread_ctx); } - - __device__ void Unlock(const ThreadContext& thread_ctx) { mutex->Unlock(thread_ctx); } - - Key* keys; - Elem* lines; - uint8_t* ages; - WarpMutex* mutex; -}; - -template -__global__ void GetKernel(LruCacheContext cache_ctx, uint32_t num_keys, const Key* keys, - Elem* values, uint32_t* n_missing_keys, Key* missing_keys, - uint32_t* missing_indices) { - ThreadContext thread_ctx{}; - __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; - for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; - batch_offset += thread_ctx.num_warps * kWarpSize) { - const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); - if (thread_ctx.lane_id < n_batch_keys) { - const Key key = keys[batch_offset + thread_ctx.lane_id]; - const size_t hash = LruCacheHash()(key); - const uint32_t set_id = hash % cache_ctx.n_set; - block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; - block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; - } - __syncthreads(); - uint32_t n_warp_missing = 0; - Key warp_missing_key = 0; - uint32_t warp_missing_index = 0; - for (uint32_t i = 0; i < n_batch_keys; ++i) { - const uint32_t key_idx = batch_offset + i; - const Key key = block_keys[thread_ctx.warp_id_in_block][i]; - const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; - SetContext set_ctx(cache_ctx, set_id); - const int way = set_ctx.Lookup(thread_ctx, key); - if (way < 0) { - if (thread_ctx.lane_id == n_warp_missing) { - warp_missing_key = key; - warp_missing_index = key_idx; - } - __syncthreads(); - n_warp_missing += 1; - } else if (!test_only) { - set_ctx.Read(cache_ctx, thread_ctx, way, values + key_idx * cache_ctx.line_size); - } - } - if (n_warp_missing > 0) { - uint32_t base_missing_idx = 0; - if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing_keys, n_warp_missing); } - __syncthreads(); - base_missing_idx = __shfl(base_missing_idx, 0); - if (thread_ctx.lane_id < n_warp_missing) { - missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; - missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; - } - __syncthreads(); - } - __syncthreads(); - } -} - -template 
-__global__ void PutWithoutEvictingKernel(LruCacheContext cache_ctx, uint32_t num_keys, - const Key* keys, const Elem* values, uint32_t* n_missing, - Key* missing_keys, uint32_t* missing_indices) { - ThreadContext thread_ctx{}; - __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; - for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; - batch_offset += thread_ctx.num_warps * kWarpSize) { - const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); - if (thread_ctx.lane_id < n_batch_keys) { - const Key key = keys[batch_offset + thread_ctx.lane_id]; - const size_t hash = LruCacheHash()(key); - const uint32_t set_id = hash % cache_ctx.n_set; - block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; - block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; - } - __syncthreads(); - uint32_t n_warp_missing = 0; - Key warp_missing_key = 0; - uint32_t warp_missing_index = 0; - for (uint32_t i = 0; i < n_batch_keys; ++i) { - const uint32_t key_idx = batch_offset + i; - const Key key = block_keys[thread_ctx.warp_id_in_block][i]; - const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; - SetContext set_ctx(cache_ctx, set_id); - set_ctx.Lock(thread_ctx); - Key evicted_key = 0; - const int insert_way = set_ctx.InsertWithoutEvicting(cache_ctx, thread_ctx, key); - if (insert_way >= 0) { - set_ctx.Write(cache_ctx, thread_ctx, insert_way, values + cache_ctx.line_size * key_idx); - } else { - if (thread_ctx.lane_id == n_warp_missing) { - warp_missing_key = key; - warp_missing_index = key_idx; - } - __syncthreads(); - n_warp_missing += 1; - } - set_ctx.Unlock(thread_ctx); - } - if (n_warp_missing > 0) { - uint32_t base_missing_idx = 0; - if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing, n_warp_missing); } - __syncthreads(); - base_missing_idx = __shfl(base_missing_idx, 0); - if (thread_ctx.lane_id < n_warp_missing) { - missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; - missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; - } - __syncthreads(); - } - } -} - -template -__global__ void EvictKernel(LruCacheContext cache_ctx, const Key* keys, - const uint32_t* indices, const Elem* values, const uint32_t* n_evict, - Key* evicted_keys, Elem* evicted_values) { - ThreadContext thread_ctx{}; - uint32_t num_evict = *n_evict; - __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; - for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_evict; - batch_offset += thread_ctx.num_warps * kWarpSize) { - const uint32_t n_batch_keys = min(kWarpSize, num_evict - batch_offset); - if (thread_ctx.lane_id < n_batch_keys) { - const Key key = keys[batch_offset + thread_ctx.lane_id]; - const size_t hash = LruCacheHash()(key); - const uint32_t set_id = hash % cache_ctx.n_set; - block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; - block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; - } - __syncthreads(); - for (uint32_t i = 0; i < n_batch_keys; ++i) { - const uint32_t key_idx = batch_offset + i; - const Key key = block_keys[thread_ctx.warp_id_in_block][i]; - const uint32_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; - SetContext set_ctx(cache_ctx, set_id); - set_ctx.Lock(thread_ctx); - int evicted_way = -1; - Key evicted_key = 0; - set_ctx.Evict(cache_ctx, 
thread_ctx, key, &evicted_way, &evicted_key); - if (thread_ctx.lane_id == 0) { evicted_keys[key_idx] = evicted_key; } - __syncthreads(); - set_ctx.Read(cache_ctx, thread_ctx, evicted_way, - evicted_values + cache_ctx.line_size * key_idx); - set_ctx.Write(cache_ctx, thread_ctx, evicted_way, - values + cache_ctx.line_size * indices[key_idx]); - set_ctx.Unlock(thread_ctx); - } - } -} - -template -__global__ void DumpKernel(LruCacheContext cache_ctx, size_t start_key_index, - size_t end_key_index, uint32_t* n_dumped, Key* keys, Elem* values) { - ThreadContext thread_ctx{}; - __shared__ Key warp_keys[kNumWarpPerBlock][kWarpSize]; - __shared__ uint8_t warp_ages[kNumWarpPerBlock][kWarpSize]; - for (uint32_t warp_start_key_index = start_key_index + thread_ctx.global_warp_id * kWarpSize; - warp_start_key_index < end_key_index; - warp_start_key_index += thread_ctx.num_warps * kWarpSize) { - Key lane_key = 0; - uint8_t lane_age = 0; - if (warp_start_key_index + thread_ctx.lane_id < end_key_index) { - lane_key = cache_ctx.keys[warp_start_key_index + thread_ctx.lane_id]; - lane_age = cache_ctx.ages[warp_start_key_index + thread_ctx.lane_id]; - } - __syncthreads(); - warp_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_key; - warp_ages[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_age; - const int key_count = __popc(static_cast(__ballot(lane_age != 0))); - if (key_count == 0) { continue; } - uint32_t offset = 0; - if (thread_ctx.lane_id == 0) { offset = atomicAdd(n_dumped, key_count); } - offset = __shfl(offset, 0); - __syncthreads(); - for (uint32_t i = 0; i < kWarpSize; ++i) { - const Key key = warp_keys[thread_ctx.warp_id_in_block][i]; - const Key age = warp_ages[thread_ctx.warp_id_in_block][i]; - if (age == 0) { continue; } - if (thread_ctx.lane_id == 0) { keys[offset] = key; } - __syncthreads(); - for (uint32_t j = thread_ctx.lane_id; j < cache_ctx.line_size; j += kWarpSize) { - values[offset * cache_ctx.line_size + j] = - cache_ctx.lines[(warp_start_key_index + i) * cache_ctx.line_size + j]; - } - __syncthreads(); - offset += 1; - } - } -} - -template -class LruCache : public Cache { - public: - OF_DISALLOW_COPY_AND_MOVE(LruCache); - explicit LruCache(const CacheOptions& options) - : device_index_{}, - max_query_length_(0), - query_indices_buffer_(nullptr), - query_keys_buffer_(nullptr), - value_type_(options.value_type) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - InitLruCacheContext(options, &ctx_); - } - ~LruCache() override { - CudaCurrentDeviceGuard guard(device_index_); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(query_indices_buffer_)); - OF_CUDA_CHECK(hipFree(query_keys_buffer_)); - } - DestroyLruCacheContext(&ctx_); - } - - uint32_t KeySize() const override { return sizeof(Key); } - uint32_t ValueSize() const override { return sizeof(Elem) * ctx_.line_size; } - DataType ValueType() const override { return value_type_; } - uint64_t Capacity() const override { return ctx_.n_set * kWarpSize; } - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length < max_query_length_) { return; } - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipFree(query_indices_buffer_)); - OF_CUDA_CHECK(hipFree(query_keys_buffer_)); - } - OF_CUDA_CHECK(hipMalloc(&query_indices_buffer_, query_length * sizeof(uint32_t))); - OF_CUDA_CHECK(hipMalloc(&query_keys_buffer_, query_length * sizeof(Key))); - max_query_length_ = query_length; - } - 
- CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kLRU; } - - void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) override { - CHECK_LE(n_keys, max_query_length_); - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - if (n_keys == 0) { return; } - cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, - static_cast(keys), nullptr, n_missing, - static_cast(missing_keys), missing_indices); - } - - void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, - void* missing_keys, uint32_t* missing_indices) override { - CHECK_LE(n_keys, max_query_length_); - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - if (n_keys == 0) { return; } - cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, - static_cast(keys), static_cast(values), n_missing, - static_cast(missing_keys), missing_indices); - } - - void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, - uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override { - CHECK_LE(n_keys, max_query_length_); - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - if (n_keys == 0) { return; } - cuda_stream->LaunchKernel(PutWithoutEvictingKernel, GetLaunchConfig(n_keys), ctx_, - n_keys, static_cast(keys), - static_cast(values), n_evicted, query_keys_buffer_, - query_indices_buffer_); - cuda_stream->LaunchKernel(EvictKernel, GetLaunchConfig(n_keys), ctx_, - query_keys_buffer_, query_indices_buffer_, - static_cast(values), n_evicted, - static_cast(evicted_keys), static_cast(evicted_values)); - } - - void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, - uint32_t* n_dumped, void* keys, void* values) override { - auto cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); - const uint64_t max_dump_keys = end_key_index - start_key_index; - cuda_stream->LaunchKernel( - DumpKernel, - ep::CudaLaunchConfig((max_dump_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, kBlockSize, - 0), - ctx_, start_key_index, end_key_index, n_dumped, static_cast(keys), - static_cast(values)); - } - - void Clear() override { ClearLruCacheContext(&ctx_); } - - private: - int device_index_; - uint32_t max_query_length_; - LruCacheContext ctx_; - uint32_t* query_indices_buffer_; - Key* query_keys_buffer_; - DataType value_type_; -}; - -template -std::unique_ptr DispatchValueType(const CacheOptions& options) { - if (options.value_size % sizeof(ulonglong2) == 0) { - return std::unique_ptr(new LruCache(options)); - } else if (options.value_size % sizeof(uint64_t) == 0) { - return std::unique_ptr(new LruCache(options)); - } else if (options.value_size % sizeof(uint32_t) == 0) { - return std::unique_ptr(new LruCache(options)); - } else if (options.value_size % sizeof(uint16_t) == 0) { - return std::unique_ptr(new LruCache(options)); - } else { - return std::unique_ptr(new LruCache(options)); - } -} - -std::unique_ptr DispatchKeyType(const CacheOptions& options) { - if (options.key_size == sizeof(uint32_t)) { - return DispatchValueType(options); - } else if (options.key_size == sizeof(uint64_t)) { - return DispatchValueType(options); - } else { - 
UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace - -std::unique_ptr NewLruCache(const CacheOptions& options) { return DispatchKeyType(options); } - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Inspired by https://github.com/NVIDIA-Merlin/HugeCTR/blob/master/gpu_cache/src/nv_gpu_cache.cu + +#include "oneflow/core/embedding/lru_cache.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/embedding/hash_functions.hip.h" +#include +#include + +namespace oneflow { + +namespace embedding { + +namespace { + +constexpr int kWarpSize = 64; +constexpr int kNumWarpPerBlock = 2; +constexpr int kBlockSize = kNumWarpPerBlock * kWarpSize; +constexpr unsigned long long int kFullMask = 0xFFFFFFFFFFFFFFFFU; + +ep::CudaLaunchConfig GetLaunchConfig(uint32_t n_keys) { + return ep::CudaLaunchConfig((n_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, + kWarpSize * kNumWarpPerBlock, 0); +} + +struct ThreadContext { + __device__ ThreadContext() { + const uint32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + global_warp_id = global_thread_id / kWarpSize; + warp_id_in_block = global_warp_id % kNumWarpPerBlock; // NOLINT + num_warps = gridDim.x * kNumWarpPerBlock; // NOLINT + lane_id = global_thread_id % kWarpSize; + } + + uint32_t global_warp_id; + uint32_t warp_id_in_block; + uint32_t num_warps; + uint32_t lane_id; +}; + +class WarpMutexAtomicImpl { + public: + OF_DISALLOW_COPY_AND_MOVE(WarpMutexAtomicImpl); + __device__ WarpMutexAtomicImpl() : flag_(0) {} + __device__ ~WarpMutexAtomicImpl() = default; + + __device__ void Lock(const ThreadContext& thread_ctx) { + if (thread_ctx.lane_id == 0) { + while (atomicCAS(&flag_, 0, 1) != 0) + ; + } + __threadfence(); + __syncthreads(); + } + + __device__ void Unlock(const ThreadContext& thread_ctx) { + __syncthreads(); + __threadfence(); + if (thread_ctx.lane_id == 0) { atomicExch(&flag_, 0); } + } + + private: + int32_t flag_; +}; + +template +struct LruCacheContext { + Key* keys; + Elem* lines; + uint8_t* ages; + void* mutex; + uint64_t n_set; + uint32_t line_size; + CacheOptions::MemoryKind value_memory_kind; +}; + +__global__ void InitCacheSetMutex(uint32_t n_set, void* mutex) { + + using WarpMutex = WarpMutexAtomicImpl; + + const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n_set) { new (reinterpret_cast(mutex) + idx) WarpMutex; } +} + +template +void ClearLruCacheContext(LruCacheContext* ctx) { + OF_CUDA_CHECK(hipMemset(ctx->keys, 0, ctx->n_set * kWarpSize * sizeof(Key))); + OF_CUDA_CHECK(hipMemset(ctx->ages, 0, ctx->n_set * kWarpSize * sizeof(uint8_t))); + InitCacheSetMutex<<<(ctx->n_set - 1 + 256) / 256, 256>>>(ctx->n_set, ctx->mutex); +} + +template +void InitLruCacheContext(const CacheOptions& options, LruCacheContext* ctx) { + const size_t keys_size_per_set = kWarpSize * sizeof(Key); + const uint32_t line_size = options.value_size / sizeof(Elem); + const size_t lines_size_per_set = kWarpSize * line_size * sizeof(Elem); + 
const size_t ages_size_per_set = kWarpSize * sizeof(uint8_t); + int device = 0; + OF_CUDA_CHECK(hipGetDevice(&device)); + int major = 0; + OF_CUDA_CHECK(hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, device)); + size_t mutex_size_per_set = 0; + + mutex_size_per_set = sizeof(WarpMutexAtomicImpl); + + const size_t n_set = (options.capacity - 1 + kWarpSize) / kWarpSize; + CHECK_GT(n_set, 0); + ctx->n_set = n_set; + ctx->line_size = line_size; + const size_t keys_size = n_set * keys_size_per_set; + OF_CUDA_CHECK(hipMalloc(&(ctx->keys), keys_size)); + const size_t lines_size = n_set * lines_size_per_set; + if (options.value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipMalloc(&(ctx->lines), lines_size)); + } else if (options.value_memory_kind == CacheOptions::MemoryKind::kHost) { + if (ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION", false)) { + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&(ctx->lines)), lines_size)); + } else { + OF_CUDA_CHECK( + NumaAwareCudaMallocHost(device, reinterpret_cast(&ctx->lines), lines_size)); + } + } else { + UNIMPLEMENTED(); + } + ctx->value_memory_kind = options.value_memory_kind; + const size_t ages_size = n_set * ages_size_per_set; + OF_CUDA_CHECK(hipMalloc(&(ctx->ages), ages_size)); + const size_t mutex_size = n_set * mutex_size_per_set; + OF_CUDA_CHECK(hipMalloc(&(ctx->mutex), mutex_size)); + + ClearLruCacheContext(ctx); +} + +template +void DestroyLruCacheContext(LruCacheContext* ctx) { + OF_CUDA_CHECK(hipFree(ctx->keys)); + if (ctx->value_memory_kind == CacheOptions::MemoryKind::kDevice) { + OF_CUDA_CHECK(hipFree(ctx->lines)); + } else if (ctx->value_memory_kind == CacheOptions::MemoryKind::kHost) { + OF_CUDA_CHECK(hipHostFree(ctx->lines)); + } else { + UNIMPLEMENTED(); + } + OF_CUDA_CHECK(hipFree(ctx->ages)); + OF_CUDA_CHECK(hipFree(ctx->mutex)); +} + +template +struct SetContext { + + using WarpMutex = WarpMutexAtomicImpl; + + __device__ SetContext(const LruCacheContext& ctx, uint32_t set_id) + : keys(ctx.keys + set_id * kWarpSize), + mutex(reinterpret_cast(ctx.mutex) + set_id), + ages(ctx.ages + set_id * kWarpSize), + lines(ctx.lines + set_id * kWarpSize * ctx.line_size) {} + + __device__ int Lookup(const ThreadContext& thread_ctx, Key key) { + const Key lane_key = keys[thread_ctx.lane_id]; + const int lane_age = ages[thread_ctx.lane_id]; + const bool lane_hit = (lane_key == key && lane_age != 0); + const unsigned long long int hit_mask = __ballot(lane_hit); + if (hit_mask != 0) { + return __ffs(static_cast(hit_mask)) - 1; + } else { + return -1; + } + } + + __device__ void Read(const LruCacheContext& cache_ctx, const ThreadContext& thread_ctx, + int way, Elem* line) { + const Elem* from_line = lines + way * cache_ctx.line_size; + for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { + line[i] = from_line[i]; + } + } + + __device__ int InsertWithoutEvicting(const LruCacheContext& cache_ctx, + const ThreadContext& thread_ctx, Key key) { + int insert_way = -1; + const Key lane_key = keys[thread_ctx.lane_id]; + int lane_age = ages[thread_ctx.lane_id]; + const unsigned long long int hit_mask = __ballot(lane_key == key && lane_age != 0); + if (hit_mask != 0) { + insert_way = __ffs(static_cast(hit_mask)) - 1; + const int insert_way_age = __shfl(lane_age, insert_way); + if (lane_age > insert_way_age) { + lane_age -= 1; + } else if (thread_ctx.lane_id == insert_way) { + lane_age = kWarpSize; + } + __syncthreads(); + } + if (insert_way == -1) { + const unsigned 
long long int valid_mask = __ballot(lane_age != 0); + if (valid_mask != kFullMask) { + insert_way = __popc(static_cast(valid_mask)); + if (lane_age > 0) { + lane_age -= 1; + } else if (thread_ctx.lane_id == insert_way) { + lane_age = kWarpSize; + keys[insert_way] = key; + } + __syncthreads(); + } + } + if (insert_way != -1) { ages[thread_ctx.lane_id] = lane_age; } + return insert_way; + } + + __device__ void Evict(const LruCacheContext& cache_ctx, + const ThreadContext& thread_ctx, Key key, int* way, Key* evicted_key) { + const Key lane_key = keys[thread_ctx.lane_id]; + int lane_age = ages[thread_ctx.lane_id]; + const int insert_way = __ffs(static_cast(__ballot(lane_age == 1))) - 1; + *evicted_key = __shfl(lane_key, insert_way); + if (thread_ctx.lane_id == insert_way) { + keys[insert_way] = key; + lane_age = kWarpSize; + } else if (lane_age > 1) { + lane_age -= 1; + } + __syncthreads(); + ages[thread_ctx.lane_id] = lane_age; + *way = insert_way; + } + + __device__ void Write(const LruCacheContext& cache_ctx, + const ThreadContext& thread_ctx, int way, const Elem* line) { + Elem* to_line = lines + way * cache_ctx.line_size; + for (int i = thread_ctx.lane_id; i < cache_ctx.line_size; i += kWarpSize) { + to_line[i] = line[i]; + } + } + + __device__ void Lock(const ThreadContext& thread_ctx) { mutex->Lock(thread_ctx); } + + __device__ void Unlock(const ThreadContext& thread_ctx) { mutex->Unlock(thread_ctx); } + + Key* keys; + Elem* lines; + uint8_t* ages; + WarpMutex* mutex; +}; + +template +__global__ void GetKernel(LruCacheContext cache_ctx, uint32_t num_keys, const Key* keys, + Elem* values, uint32_t* n_missing_keys, Key* missing_keys, + uint32_t* missing_indices) { + ThreadContext thread_ctx{}; + __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; + for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; + batch_offset += thread_ctx.num_warps * kWarpSize) { + const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); + if (thread_ctx.lane_id < n_batch_keys) { + const Key key = keys[batch_offset + thread_ctx.lane_id]; + const size_t hash = LruCacheHash()(key); + const uint32_t set_id = hash % cache_ctx.n_set; + block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; + block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; + } + __syncthreads(); + uint32_t n_warp_missing = 0; + Key warp_missing_key = 0; + uint32_t warp_missing_index = 0; + for (uint32_t i = 0; i < n_batch_keys; ++i) { + const uint32_t key_idx = batch_offset + i; + const Key key = block_keys[thread_ctx.warp_id_in_block][i]; + const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; + SetContext set_ctx(cache_ctx, set_id); + const int way = set_ctx.Lookup(thread_ctx, key); + if (way < 0) { + if (thread_ctx.lane_id == n_warp_missing) { + warp_missing_key = key; + warp_missing_index = key_idx; + } + __syncthreads(); + n_warp_missing += 1; + } else if (!test_only) { + set_ctx.Read(cache_ctx, thread_ctx, way, values + key_idx * cache_ctx.line_size); + } + } + if (n_warp_missing > 0) { + uint32_t base_missing_idx = 0; + if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing_keys, n_warp_missing); } + __syncthreads(); + base_missing_idx = __shfl(base_missing_idx, 0); + if (thread_ctx.lane_id < n_warp_missing) { + missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; + missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; + 
} + __syncthreads(); + } + __syncthreads(); + } +} + +template +__global__ void PutWithoutEvictingKernel(LruCacheContext cache_ctx, uint32_t num_keys, + const Key* keys, const Elem* values, uint32_t* n_missing, + Key* missing_keys, uint32_t* missing_indices) { + ThreadContext thread_ctx{}; + __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; + for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_keys; + batch_offset += thread_ctx.num_warps * kWarpSize) { + const uint32_t n_batch_keys = min(kWarpSize, num_keys - batch_offset); + if (thread_ctx.lane_id < n_batch_keys) { + const Key key = keys[batch_offset + thread_ctx.lane_id]; + const size_t hash = LruCacheHash()(key); + const uint32_t set_id = hash % cache_ctx.n_set; + block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; + block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; + } + __syncthreads(); + uint32_t n_warp_missing = 0; + Key warp_missing_key = 0; + uint32_t warp_missing_index = 0; + for (uint32_t i = 0; i < n_batch_keys; ++i) { + const uint32_t key_idx = batch_offset + i; + const Key key = block_keys[thread_ctx.warp_id_in_block][i]; + const size_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; + SetContext set_ctx(cache_ctx, set_id); + set_ctx.Lock(thread_ctx); + Key evicted_key = 0; + const int insert_way = set_ctx.InsertWithoutEvicting(cache_ctx, thread_ctx, key); + if (insert_way >= 0) { + set_ctx.Write(cache_ctx, thread_ctx, insert_way, values + cache_ctx.line_size * key_idx); + } else { + if (thread_ctx.lane_id == n_warp_missing) { + warp_missing_key = key; + warp_missing_index = key_idx; + } + __syncthreads(); + n_warp_missing += 1; + } + set_ctx.Unlock(thread_ctx); + } + if (n_warp_missing > 0) { + uint32_t base_missing_idx = 0; + if (thread_ctx.lane_id == 0) { base_missing_idx = atomicAdd(n_missing, n_warp_missing); } + __syncthreads(); + base_missing_idx = __shfl(base_missing_idx, 0); + if (thread_ctx.lane_id < n_warp_missing) { + missing_keys[base_missing_idx + thread_ctx.lane_id] = warp_missing_key; + missing_indices[base_missing_idx + thread_ctx.lane_id] = warp_missing_index; + } + __syncthreads(); + } + } +} + +template +__global__ void EvictKernel(LruCacheContext cache_ctx, const Key* keys, + const uint32_t* indices, const Elem* values, const uint32_t* n_evict, + Key* evicted_keys, Elem* evicted_values) { + ThreadContext thread_ctx{}; + uint32_t num_evict = *n_evict; + __shared__ Key block_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ size_t block_set_ids[kNumWarpPerBlock][kWarpSize]; + for (uint32_t batch_offset = thread_ctx.global_warp_id * kWarpSize; batch_offset < num_evict; + batch_offset += thread_ctx.num_warps * kWarpSize) { + const uint32_t n_batch_keys = min(kWarpSize, num_evict - batch_offset); + if (thread_ctx.lane_id < n_batch_keys) { + const Key key = keys[batch_offset + thread_ctx.lane_id]; + const size_t hash = LruCacheHash()(key); + const uint32_t set_id = hash % cache_ctx.n_set; + block_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = key; + block_set_ids[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = set_id; + } + __syncthreads(); + for (uint32_t i = 0; i < n_batch_keys; ++i) { + const uint32_t key_idx = batch_offset + i; + const Key key = block_keys[thread_ctx.warp_id_in_block][i]; + const uint32_t set_id = block_set_ids[thread_ctx.warp_id_in_block][i]; + SetContext set_ctx(cache_ctx, set_id); + set_ctx.Lock(thread_ctx); + int evicted_way = 
-1; + Key evicted_key = 0; + set_ctx.Evict(cache_ctx, thread_ctx, key, &evicted_way, &evicted_key); + if (thread_ctx.lane_id == 0) { evicted_keys[key_idx] = evicted_key; } + __syncthreads(); + set_ctx.Read(cache_ctx, thread_ctx, evicted_way, + evicted_values + cache_ctx.line_size * key_idx); + set_ctx.Write(cache_ctx, thread_ctx, evicted_way, + values + cache_ctx.line_size * indices[key_idx]); + set_ctx.Unlock(thread_ctx); + } + } +} + +template +__global__ void DumpKernel(LruCacheContext cache_ctx, size_t start_key_index, + size_t end_key_index, uint32_t* n_dumped, Key* keys, Elem* values) { + ThreadContext thread_ctx{}; + __shared__ Key warp_keys[kNumWarpPerBlock][kWarpSize]; + __shared__ uint8_t warp_ages[kNumWarpPerBlock][kWarpSize]; + for (uint32_t warp_start_key_index = start_key_index + thread_ctx.global_warp_id * kWarpSize; + warp_start_key_index < end_key_index; + warp_start_key_index += thread_ctx.num_warps * kWarpSize) { + Key lane_key = 0; + uint8_t lane_age = 0; + if (warp_start_key_index + thread_ctx.lane_id < end_key_index) { + lane_key = cache_ctx.keys[warp_start_key_index + thread_ctx.lane_id]; + lane_age = cache_ctx.ages[warp_start_key_index + thread_ctx.lane_id]; + } + __syncthreads(); + warp_keys[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_key; + warp_ages[thread_ctx.warp_id_in_block][thread_ctx.lane_id] = lane_age; + const int key_count = __popc(static_cast(__ballot(lane_age != 0))); + if (key_count == 0) { continue; } + uint32_t offset = 0; + if (thread_ctx.lane_id == 0) { offset = atomicAdd(n_dumped, key_count); } + offset = __shfl(offset, 0); + __syncthreads(); + for (uint32_t i = 0; i < kWarpSize; ++i) { + const Key key = warp_keys[thread_ctx.warp_id_in_block][i]; + const Key age = warp_ages[thread_ctx.warp_id_in_block][i]; + if (age == 0) { continue; } + if (thread_ctx.lane_id == 0) { keys[offset] = key; } + __syncthreads(); + for (uint32_t j = thread_ctx.lane_id; j < cache_ctx.line_size; j += kWarpSize) { + values[offset * cache_ctx.line_size + j] = + cache_ctx.lines[(warp_start_key_index + i) * cache_ctx.line_size + j]; + } + __syncthreads(); + offset += 1; + } + } +} + +template +class LruCache : public Cache { + public: + OF_DISALLOW_COPY_AND_MOVE(LruCache); + explicit LruCache(const CacheOptions& options) + : device_index_{}, + max_query_length_(0), + query_indices_buffer_(nullptr), + query_keys_buffer_(nullptr), + value_type_(options.value_type) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + InitLruCacheContext(options, &ctx_); + } + ~LruCache() override { + CudaCurrentDeviceGuard guard(device_index_); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(query_indices_buffer_)); + OF_CUDA_CHECK(hipFree(query_keys_buffer_)); + } + DestroyLruCacheContext(&ctx_); + } + + uint32_t KeySize() const override { return sizeof(Key); } + uint32_t ValueSize() const override { return sizeof(Elem) * ctx_.line_size; } + DataType ValueType() const override { return value_type_; } + uint64_t Capacity() const override { return ctx_.n_set * kWarpSize; } + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length < max_query_length_) { return; } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipFree(query_indices_buffer_)); + OF_CUDA_CHECK(hipFree(query_keys_buffer_)); + } + OF_CUDA_CHECK(hipMalloc(&query_indices_buffer_, query_length * sizeof(uint32_t))); + OF_CUDA_CHECK(hipMalloc(&query_keys_buffer_, query_length * 
sizeof(Key))); + max_query_length_ = query_length; + } + + CacheOptions::Policy Policy() const override { return CacheOptions::Policy::kLRU; } + + void Test(ep::Stream* stream, uint32_t n_keys, const void* keys, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) override { + CHECK_LE(n_keys, max_query_length_); + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + if (n_keys == 0) { return; } + cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, + static_cast(keys), nullptr, n_missing, + static_cast(missing_keys), missing_indices); + } + + void Get(ep::Stream* stream, uint32_t n_keys, const void* keys, void* values, uint32_t* n_missing, + void* missing_keys, uint32_t* missing_indices) override { + CHECK_LE(n_keys, max_query_length_); + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + if (n_keys == 0) { return; } + cuda_stream->LaunchKernel(GetKernel, GetLaunchConfig(n_keys), ctx_, n_keys, + static_cast(keys), static_cast(values), n_missing, + static_cast(missing_keys), missing_indices); + } + + void Put(ep::Stream* stream, uint32_t n_keys, const void* keys, const void* values, + uint32_t* n_evicted, void* evicted_keys, void* evicted_values) override { + CHECK_LE(n_keys, max_query_length_); + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_evicted, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + if (n_keys == 0) { return; } + cuda_stream->LaunchKernel(PutWithoutEvictingKernel, GetLaunchConfig(n_keys), ctx_, + n_keys, static_cast(keys), + static_cast(values), n_evicted, query_keys_buffer_, + query_indices_buffer_); + cuda_stream->LaunchKernel(EvictKernel, GetLaunchConfig(n_keys), ctx_, + query_keys_buffer_, query_indices_buffer_, + static_cast(values), n_evicted, + static_cast(evicted_keys), static_cast(evicted_values)); + } + + void Dump(ep::Stream* stream, uint64_t start_key_index, uint64_t end_key_index, + uint32_t* n_dumped, void* keys, void* values) override { + auto cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(n_dumped, 0, sizeof(uint32_t), cuda_stream->cuda_stream())); + const uint64_t max_dump_keys = end_key_index - start_key_index; + cuda_stream->LaunchKernel( + DumpKernel, + ep::CudaLaunchConfig((max_dump_keys + kNumWarpPerBlock - 1) / kNumWarpPerBlock, kBlockSize, + 0), + ctx_, start_key_index, end_key_index, n_dumped, static_cast(keys), + static_cast(values)); + } + + void Clear() override { ClearLruCacheContext(&ctx_); } + + private: + int device_index_; + uint32_t max_query_length_; + LruCacheContext ctx_; + uint32_t* query_indices_buffer_; + Key* query_keys_buffer_; + DataType value_type_; +}; + +template +std::unique_ptr DispatchValueType(const CacheOptions& options) { + if (options.value_size % sizeof(ulonglong2) == 0) { + return std::unique_ptr(new LruCache(options)); + } else if (options.value_size % sizeof(uint64_t) == 0) { + return std::unique_ptr(new LruCache(options)); + } else if (options.value_size % sizeof(uint32_t) == 0) { + return std::unique_ptr(new LruCache(options)); + } else if (options.value_size % sizeof(uint16_t) == 0) { + return std::unique_ptr(new LruCache(options)); + } else { + return std::unique_ptr(new LruCache(options)); + } +} + +std::unique_ptr DispatchKeyType(const CacheOptions& options) { + if (options.key_size == sizeof(uint32_t)) { + return DispatchValueType(options); + } else if (options.key_size == 
sizeof(uint64_t)) { + return DispatchValueType(options); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace + +std::unique_ptr NewLruCache(const CacheOptions& options) { return DispatchKeyType(options); } + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/mock_key_value_store.hip.cpp b/oneflow/core/embedding/mock_key_value_store.hip.cpp index de55f2a..9897779 100644 --- a/oneflow/core/embedding/mock_key_value_store.hip.cpp +++ b/oneflow/core/embedding/mock_key_value_store.hip.cpp @@ -1,249 +1,249 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/embedding/mock_key_value_store.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace embedding { - -namespace { - -template -class IteratorImpl : public KVIterator { - public: - OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); - IteratorImpl(HashMap* store, uint32_t key_size, uint32_t value_size, - uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, - uint32_t* host_num_buffer) - : store_(store), - pos_(store->begin()), - key_size_(key_size), - value_size_(value_size), - max_query_length_(max_query_length), - host_keys_buffer_(host_keys_buffer), - host_values_buffer_(host_values_buffer), - host_num_buffer_(host_num_buffer) {} - ~IteratorImpl() override = default; - - void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, - void* values) override { - CHECK_LE(n_request, max_query_length_); - auto cuda_stream = stream->As(); - CHECK_JUST(cuda_stream->Sync()); - *host_num_buffer_ = 0; - while (*host_num_buffer_ < n_request && pos_ != store_->end()) { - reinterpret_cast(host_keys_buffer_)[*host_num_buffer_] = pos_->first; - std::memcpy(reinterpret_cast(host_values_buffer_) + *host_num_buffer_ * value_size_, - pos_->second.data(), value_size_); - } - OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - const uint32_t num_keys = *host_num_buffer_; - if (num_keys != 0) { - OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - } - } - - void Reset() override { pos_ = store_->begin(); } - - private: - HashMap* store_; - typename HashMap::iterator pos_; - uint32_t key_size_; - uint32_t value_size_; - uint32_t max_query_length_; - void* host_keys_buffer_; - void* host_values_buffer_; - uint32_t* host_num_buffer_; -}; - -template -class KeyValueStoreImpl : public KeyValueStore { - public: - OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); - explicit KeyValueStoreImpl(const MockKeyValueStoreOptions& options) - : device_index_(-1), max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - key_size_ = options.key_size; - value_size_ = 
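Note on the dispatch helpers that close out lru_cache.hip.cpp: DispatchValueType picks the widest element type whose size evenly divides options.value_size, so each warp lane can move more bytes per iteration when a cache line is read or written. The following is a minimal, self-contained sketch of that selection rule only, not the OneFlow API; ULongLong2 stands in for HIP's 16-byte ulonglong2 and ChosenElemWidth is an illustrative name.

// Sketch (illustrative): reproduce the element-width selection rule used by
// DispatchValueType. The cache copies each value as an array of Elem, so it
// prefers the widest type whose size evenly divides value_size.
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for HIP's 16-byte ulonglong2 vector type.
struct ULongLong2 { unsigned long long x, y; };

std::size_t ChosenElemWidth(std::size_t value_size) {
  if (value_size % sizeof(ULongLong2) == 0) { return sizeof(ULongLong2); }  // 16-byte copies
  if (value_size % sizeof(uint64_t) == 0) { return sizeof(uint64_t); }      // 8-byte copies
  if (value_size % sizeof(uint32_t) == 0) { return sizeof(uint32_t); }      // 4-byte copies
  if (value_size % sizeof(uint16_t) == 0) { return sizeof(uint16_t); }      // 2-byte copies
  return sizeof(uint8_t);                                                   // byte-wise copies
}

int main() {
  // e.g. a 128-dim float16 embedding row is 256 bytes, so it is copied as 16-byte elements.
  std::printf("elem width for 256-byte rows: %zu\n", ChosenElemWidth(256));
  std::printf("elem width for 6-byte rows:   %zu\n", ChosenElemWidth(6));
  return 0;
}

With a wider Elem, line_size = value_size / sizeof(Elem) shrinks, so the per-way Read/Write loops in SetContext finish in fewer iterations per lane.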
options.value_size; - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_query_values_), - value_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), - sizeof(uint32_t))); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * max_query_length_)); - } - ~KeyValueStoreImpl() { - CudaCurrentDeviceGuard guard(device_index_); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(hipHostFree(host_n_missing_)); - } - - uint32_t KeySize() const override { return key_size_; } - - uint32_t ValueSize() const override { return value_size_; } - - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * query_length)); - max_query_length_ = query_length; - } - - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint32_t* n_missing, uint32_t* missing_indices) override; - void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; - bool SnapshotExists(const std::string& name) override; - void LoadSnapshot(const std::string& name) override; - void LoadSnapshot(const std::string& name, - const std::function& Hook) override; - void SaveSnapshot(const std::string& name) override; - - private: - int device_index_; - uint32_t max_query_length_; - uint32_t key_size_; - uint32_t value_size_; - Key* host_query_keys_{}; - uint8_t* host_query_values_{}; - uint32_t* host_n_missing_{}; - uint32_t* host_missing_indices_{}; - HashMap store_; - HashMap> snapshots_; - std::mutex mutex_; -}; - -template -void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint32_t* n_missing, uint32_t* missing_indices) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - return; - } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - *host_n_missing_ = 0; - for (uint32_t i = 0; i < num_keys; ++i) { - auto it = store_.find(host_query_keys_[i]); - if (it != store_.end()) { - std::memcpy(host_query_values_ + i * value_size_, it->second.data(), value_size_); - } else { - host_missing_indices_[*host_n_missing_] = i; - *host_n_missing_ += 1; - } - } - OF_CUDA_CHECK(hipMemcpyAsync(values, 
host_query_values_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, - (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); -} - -template -void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, - const void* values) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { return; } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - for (uint32_t i = 0; i < num_keys; ++i) { - store_[host_query_keys_[i]] = std::string( - reinterpret_cast(host_query_values_) + i * value_size_, value_size_); - } -} - -template -bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { - return snapshots_.find(name) != snapshots_.end(); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - LoadSnapshot(name, nullptr); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name, - const std::function& Hook) { - CudaCurrentDeviceGuard guard(device_index_); - store_ = snapshots_[name]; - if (Hook) { - IteratorImpl iterator(&store_, KeySize(), ValueSize(), max_query_length_, host_query_keys_, - host_query_values_, host_n_missing_); - Hook(&iterator); - } -} - -template -void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - snapshots_[name] = store_; -} - -} // namespace - -std::unique_ptr NewMockKeyValueStore(const MockKeyValueStoreOptions& options) { - if (options.key_size == sizeof(uint64_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else if (options.key_size == sizeof(uint32_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/embedding/mock_key_value_store.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace embedding { + +namespace { + +template +class IteratorImpl : public KVIterator { + public: + OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); + IteratorImpl(HashMap* store, uint32_t key_size, uint32_t value_size, + uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, + uint32_t* host_num_buffer) + : store_(store), + pos_(store->begin()), + key_size_(key_size), + value_size_(value_size), + max_query_length_(max_query_length), + host_keys_buffer_(host_keys_buffer), + host_values_buffer_(host_values_buffer), + host_num_buffer_(host_num_buffer) {} + ~IteratorImpl() override = default; + + void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, + void* values) override { + CHECK_LE(n_request, max_query_length_); + auto cuda_stream = stream->As(); + CHECK_JUST(cuda_stream->Sync()); + *host_num_buffer_ = 0; + while (*host_num_buffer_ < n_request && pos_ != store_->end()) { + reinterpret_cast(host_keys_buffer_)[*host_num_buffer_] = pos_->first; + std::memcpy(reinterpret_cast(host_values_buffer_) + *host_num_buffer_ * value_size_, + pos_->second.data(), value_size_); + } + OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + const uint32_t num_keys = *host_num_buffer_; + if (num_keys != 0) { + OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + } + } + + void Reset() override { pos_ = store_->begin(); } + + private: + HashMap* store_; + typename HashMap::iterator pos_; + uint32_t key_size_; + uint32_t value_size_; + uint32_t max_query_length_; + void* host_keys_buffer_; + void* host_values_buffer_; + uint32_t* host_num_buffer_; +}; + +template +class KeyValueStoreImpl : public KeyValueStore { + public: + OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); + explicit KeyValueStoreImpl(const MockKeyValueStoreOptions& options) + : device_index_(-1), max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + key_size_ = options.key_size; + value_size_ = options.value_size; + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_query_values_), + value_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), + sizeof(uint32_t))); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * max_query_length_)); + } + ~KeyValueStoreImpl() { + CudaCurrentDeviceGuard guard(device_index_); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(hipHostFree(host_n_missing_)); + } + + uint32_t KeySize() const override { return key_size_; } + + uint32_t ValueSize() const override { return value_size_; } + + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { 
return; } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * query_length)); + max_query_length_ = query_length; + } + + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint32_t* n_missing, uint32_t* missing_indices) override; + void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; + bool SnapshotExists(const std::string& name) override; + void LoadSnapshot(const std::string& name) override; + void LoadSnapshot(const std::string& name, + const std::function& Hook) override; + void SaveSnapshot(const std::string& name) override; + + private: + int device_index_; + uint32_t max_query_length_; + uint32_t key_size_; + uint32_t value_size_; + Key* host_query_keys_{}; + uint8_t* host_query_values_{}; + uint32_t* host_n_missing_{}; + uint32_t* host_missing_indices_{}; + HashMap store_; + HashMap> snapshots_; + std::mutex mutex_; +}; + +template +void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint32_t* n_missing, uint32_t* missing_indices) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + return; + } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + *host_n_missing_ = 0; + for (uint32_t i = 0; i < num_keys; ++i) { + auto it = store_.find(host_query_keys_[i]); + if (it != store_.end()) { + std::memcpy(host_query_values_ + i * value_size_, it->second.data(), value_size_); + } else { + host_missing_indices_[*host_n_missing_] = i; + *host_n_missing_ += 1; + } + } + OF_CUDA_CHECK(hipMemcpyAsync(values, host_query_values_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, + (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); +} + +template +void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, + const void* values) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { return; } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + for (uint32_t i = 0; i < num_keys; ++i) { + store_[host_query_keys_[i]] = std::string( + reinterpret_cast(host_query_values_) + i * value_size_, value_size_); + } +} + +template +bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { + return 
snapshots_.find(name) != snapshots_.end(); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + LoadSnapshot(name, nullptr); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name, + const std::function& Hook) { + CudaCurrentDeviceGuard guard(device_index_); + store_ = snapshots_[name]; + if (Hook) { + IteratorImpl iterator(&store_, KeySize(), ValueSize(), max_query_length_, host_query_keys_, + host_query_values_, host_n_missing_); + Hook(&iterator); + } +} + +template +void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + snapshots_[name] = store_; +} + +} // namespace + +std::unique_ptr NewMockKeyValueStore(const MockKeyValueStoreOptions& options) { + if (options.key_size == sizeof(uint64_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else if (options.key_size == sizeof(uint32_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp b/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp index 46fca92..8ec7a04 100644 --- a/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp +++ b/oneflow/core/embedding/persistent_table_key_value_store.hip.cpp @@ -1,243 +1,243 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
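mock_key_value_store.hip.cpp above resolves every query on the host: the queried keys are staged into pinned buffers, looked up in an in-memory HashMap, found values are written at the key's query index, and the indices of keys that were not found are compacted into missing_indices while *host_n_missing_ counts them. Below is a host-only sketch of that lookup step with illustrative names (MockGet, a plain std::unordered_map) rather than the actual OneFlow types.

// Sketch (illustrative): the miss-compacting lookup performed by KeyValueStoreImpl::Get.
#include <cstdint>
#include <cstring>
#include <string>
#include <unordered_map>
#include <vector>

void MockGet(const std::unordered_map<uint64_t, std::string>& store, uint32_t value_size,
             const std::vector<uint64_t>& keys, std::vector<uint8_t>* values,
             std::vector<uint32_t>* missing_indices) {
  values->assign(keys.size() * value_size, 0);
  missing_indices->clear();
  for (std::size_t i = 0; i < keys.size(); ++i) {
    auto it = store.find(keys[i]);
    if (it != store.end()) {
      // Each stored value is assumed to hold at least value_size bytes.
      std::memcpy(values->data() + i * value_size, it->second.data(), value_size);
    } else {
      // n_missing is simply missing_indices->size() after the loop.
      missing_indices->push_back(static_cast<uint32_t>(i));
    }
  }
}

int main() {
  std::unordered_map<uint64_t, std::string> store{{7, std::string(4, '\x01')}};
  std::vector<uint8_t> values;
  std::vector<uint32_t> missing;
  MockGet(store, /*value_size=*/4, {7, 42}, &values, &missing);  // key 42 is missing
  return missing.size() == 1 ? 0 : 1;
}

The Put path is the mirror image: it copies keys and values into the host buffers, synchronizes the stream, and overwrites store_[key] with the value bytes.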
-*/ -#include "oneflow/core/embedding/persistent_table_key_value_store.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/embedding/persistent_table.h" -#include -#include -#include -#include -#include - -namespace oneflow { - -namespace embedding { - -namespace { - -class IteratorImpl : public KVIterator { - public: - OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); - IteratorImpl(PersistentTable::Iterator* base_iter, uint32_t key_size, uint32_t value_size, - uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, - uint32_t* host_num_buffer) - : base_iter_(base_iter), - key_size_(key_size), - value_size_(value_size), - max_query_length_(max_query_length), - host_keys_buffer_(host_keys_buffer), - host_values_buffer_(host_values_buffer), - host_num_buffer_(host_num_buffer) {} - ~IteratorImpl() override = default; - - void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, - void* values) override { - CHECK_LE(n_request, max_query_length_); - auto cuda_stream = stream->As(); - CHECK_JUST(cuda_stream->Sync()); - base_iter_->Next(n_request, host_num_buffer_, host_keys_buffer_, host_values_buffer_); - OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - const uint32_t num_keys = *host_num_buffer_; - if (num_keys != 0) { - OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - } - } - - void Reset() override { base_iter_->Reset(); } - - private: - PersistentTable::Iterator* base_iter_; - uint32_t key_size_; - uint32_t value_size_; - uint32_t max_query_length_; - void* host_keys_buffer_; - void* host_values_buffer_; - uint32_t* host_num_buffer_; -}; - -template -class KeyValueStoreImpl : public KeyValueStore { - public: - OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); - explicit KeyValueStoreImpl(const PersistentTableKeyValueStoreOptions& options) - : device_index_(-1), max_query_length_(0) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - key_size_ = options.table_options.key_size; - value_size_ = options.table_options.value_size; - table_ = NewPersistentTable(options.table_options); - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_query_values_), - value_size_ * max_query_length_)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), - sizeof(uint32_t))); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * max_query_length_)); - } - ~KeyValueStoreImpl() { - CudaCurrentDeviceGuard guard(device_index_); - if (max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(hipHostFree(host_n_missing_)); - } - - uint32_t KeySize() const override { return key_size_; } - - uint32_t ValueSize() const override { return value_size_; } - - uint32_t MaxQueryLength() const override { return max_query_length_; } - - void ReserveQueryLength(uint32_t query_length) override { - CudaCurrentDeviceGuard guard(device_index_); - if (query_length <= max_query_length_) { return; } - if 
(max_query_length_ != 0) { - OF_CUDA_CHECK(hipHostFree(host_query_keys_)); - OF_CUDA_CHECK(hipHostFree(host_query_values_)); - OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); - } - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost( - device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); - OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, - reinterpret_cast(&host_missing_indices_), - sizeof(uint32_t) * query_length)); - max_query_length_ = query_length; - } - - void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, - uint32_t* n_missing, uint32_t* missing_indices) override; - void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; - bool SnapshotExists(const std::string& name) override; - void LoadSnapshot(const std::string& name) override; - void LoadSnapshot(const std::string& name, - const std::function& Hook) override; - void SaveSnapshot(const std::string& name) override; - - private: - int device_index_; - uint32_t max_query_length_; - uint32_t key_size_; - uint32_t value_size_; - Key* host_query_keys_{}; - uint8_t* host_query_values_{}; - uint32_t* host_n_missing_{}; - uint32_t* host_missing_indices_{}; - - std::mutex mutex_; - std::unique_ptr table_; -}; - -template -void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, - void* values, uint32_t* n_missing, uint32_t* missing_indices) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { - OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), - stream->As()->cuda_stream())); - return; - } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - - table_->Get(num_keys, host_query_keys_, host_query_values_, host_n_missing_, - host_missing_indices_); - - OF_CUDA_CHECK(hipMemcpyAsync(values, host_query_values_, num_keys * value_size_, - hipMemcpyDefault, cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, - (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, - cuda_stream->cuda_stream())); -} - -template -void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, - const void* values) { - std::lock_guard lock(mutex_); - auto cuda_stream = stream->As(); - CHECK_LE(num_keys, max_query_length_); - if (num_keys == 0) { return; } - OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, - cuda_stream->cuda_stream())); - OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, - hipMemcpyDefault, cuda_stream->cuda_stream())); - CHECK_JUST(cuda_stream->Sync()); - table_->Put(num_keys, host_query_keys_, host_query_values_); -} - -template -bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { - return table_->SnapshotExists(name); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - LoadSnapshot(name, nullptr); -} - -template -void KeyValueStoreImpl::LoadSnapshot(const std::string& name, - const std::function& Hook) { - CudaCurrentDeviceGuard guard(device_index_); - if 
(Hook) { - table_->LoadSnapshot(name, [&](PersistentTable::Iterator* chunk_iterator) { - IteratorImpl iterator(chunk_iterator, KeySize(), ValueSize(), max_query_length_, - host_query_keys_, host_query_values_, host_n_missing_); - Hook(&iterator); - }); - } else { - table_->LoadSnapshot(name); - } -} - -template -void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { - CudaCurrentDeviceGuard guard(device_index_); - table_->SaveSnapshot(name); -} - -} // namespace - -std::unique_ptr NewPersistentTableKeyValueStore( - const PersistentTableKeyValueStoreOptions& options) { - if (options.table_options.key_size == sizeof(uint64_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else if (options.table_options.key_size == sizeof(uint32_t)) { - return std::unique_ptr(new KeyValueStoreImpl(options)); - } else { - UNIMPLEMENTED(); - return nullptr; - } -} - -} // namespace embedding - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/embedding/persistent_table_key_value_store.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/embedding/persistent_table.h" +#include +#include +#include +#include +#include + +namespace oneflow { + +namespace embedding { + +namespace { + +class IteratorImpl : public KVIterator { + public: + OF_DISALLOW_COPY_AND_MOVE(IteratorImpl); + IteratorImpl(PersistentTable::Iterator* base_iter, uint32_t key_size, uint32_t value_size, + uint32_t max_query_length, void* host_keys_buffer, void* host_values_buffer, + uint32_t* host_num_buffer) + : base_iter_(base_iter), + key_size_(key_size), + value_size_(value_size), + max_query_length_(max_query_length), + host_keys_buffer_(host_keys_buffer), + host_values_buffer_(host_values_buffer), + host_num_buffer_(host_num_buffer) {} + ~IteratorImpl() override = default; + + void NextN(ep::Stream* stream, uint32_t n_request, uint32_t* n_result, void* keys, + void* values) override { + CHECK_LE(n_request, max_query_length_); + auto cuda_stream = stream->As(); + CHECK_JUST(cuda_stream->Sync()); + base_iter_->Next(n_request, host_num_buffer_, host_keys_buffer_, host_values_buffer_); + OF_CUDA_CHECK(hipMemcpyAsync(n_result, host_num_buffer_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + const uint32_t num_keys = *host_num_buffer_; + if (num_keys != 0) { + OF_CUDA_CHECK(hipMemcpyAsync(keys, host_keys_buffer_, num_keys * key_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(values, host_values_buffer_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + } + } + + void Reset() override { base_iter_->Reset(); } + + private: + PersistentTable::Iterator* base_iter_; + uint32_t key_size_; + uint32_t value_size_; + uint32_t max_query_length_; + void* host_keys_buffer_; + void* host_values_buffer_; + uint32_t* host_num_buffer_; +}; + +template +class KeyValueStoreImpl : public KeyValueStore { + public: + OF_DISALLOW_COPY_AND_MOVE(KeyValueStoreImpl); + explicit 
KeyValueStoreImpl(const PersistentTableKeyValueStoreOptions& options) + : device_index_(-1), max_query_length_(0) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + key_size_ = options.table_options.key_size; + value_size_ = options.table_options.value_size; + table_ = NewPersistentTable(options.table_options); + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_query_values_), + value_size_ * max_query_length_)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, reinterpret_cast(&host_n_missing_), + sizeof(uint32_t))); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * max_query_length_)); + } + ~KeyValueStoreImpl() { + CudaCurrentDeviceGuard guard(device_index_); + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(hipHostFree(host_n_missing_)); + } + + uint32_t KeySize() const override { return key_size_; } + + uint32_t ValueSize() const override { return value_size_; } + + uint32_t MaxQueryLength() const override { return max_query_length_; } + + void ReserveQueryLength(uint32_t query_length) override { + CudaCurrentDeviceGuard guard(device_index_); + if (query_length <= max_query_length_) { return; } + if (max_query_length_ != 0) { + OF_CUDA_CHECK(hipHostFree(host_query_keys_)); + OF_CUDA_CHECK(hipHostFree(host_query_values_)); + OF_CUDA_CHECK(hipHostFree(host_missing_indices_)); + } + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_keys_), key_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost( + device_index_, reinterpret_cast(&host_query_values_), value_size_ * query_length)); + OF_CUDA_CHECK(NumaAwareCudaMallocHost(device_index_, + reinterpret_cast(&host_missing_indices_), + sizeof(uint32_t) * query_length)); + max_query_length_ = query_length; + } + + void Get(ep::Stream* stream, uint32_t num_keys, const void* keys, void* values, + uint32_t* n_missing, uint32_t* missing_indices) override; + void Put(ep::Stream* stream, uint32_t num_keys, const void* keys, const void* values) override; + bool SnapshotExists(const std::string& name) override; + void LoadSnapshot(const std::string& name) override; + void LoadSnapshot(const std::string& name, + const std::function& Hook) override; + void SaveSnapshot(const std::string& name) override; + + private: + int device_index_; + uint32_t max_query_length_; + uint32_t key_size_; + uint32_t value_size_; + Key* host_query_keys_{}; + uint8_t* host_query_values_{}; + uint32_t* host_n_missing_{}; + uint32_t* host_missing_indices_{}; + + std::mutex mutex_; + std::unique_ptr table_; +}; + +template +void KeyValueStoreImpl::Get(ep::Stream* stream, uint32_t num_keys, const void* keys, + void* values, uint32_t* n_missing, uint32_t* missing_indices) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { + OF_CUDA_CHECK(hipMemsetAsync(n_missing, 0, sizeof(uint32_t), + stream->As()->cuda_stream())); + return; + } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + + table_->Get(num_keys, host_query_keys_, host_query_values_, host_n_missing_, + 
host_missing_indices_); + + OF_CUDA_CHECK(hipMemcpyAsync(values, host_query_values_, num_keys * value_size_, + hipMemcpyDefault, cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(n_missing, host_n_missing_, sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(missing_indices, host_missing_indices_, + (*host_n_missing_) * sizeof(uint32_t), hipMemcpyDefault, + cuda_stream->cuda_stream())); +} + +template +void KeyValueStoreImpl::Put(ep::Stream* stream, uint32_t num_keys, const void* keys, + const void* values) { + std::lock_guard lock(mutex_); + auto cuda_stream = stream->As(); + CHECK_LE(num_keys, max_query_length_); + if (num_keys == 0) { return; } + OF_CUDA_CHECK(hipMemcpyAsync(host_query_keys_, keys, key_size_ * num_keys, hipMemcpyDefault, + cuda_stream->cuda_stream())); + OF_CUDA_CHECK(hipMemcpyAsync(host_query_values_, values, value_size_ * num_keys, + hipMemcpyDefault, cuda_stream->cuda_stream())); + CHECK_JUST(cuda_stream->Sync()); + table_->Put(num_keys, host_query_keys_, host_query_values_); +} + +template +bool KeyValueStoreImpl::SnapshotExists(const std::string& name) { + return table_->SnapshotExists(name); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + LoadSnapshot(name, nullptr); +} + +template +void KeyValueStoreImpl::LoadSnapshot(const std::string& name, + const std::function& Hook) { + CudaCurrentDeviceGuard guard(device_index_); + if (Hook) { + table_->LoadSnapshot(name, [&](PersistentTable::Iterator* chunk_iterator) { + IteratorImpl iterator(chunk_iterator, KeySize(), ValueSize(), max_query_length_, + host_query_keys_, host_query_values_, host_n_missing_); + Hook(&iterator); + }); + } else { + table_->LoadSnapshot(name); + } +} + +template +void KeyValueStoreImpl::SaveSnapshot(const std::string& name) { + CudaCurrentDeviceGuard guard(device_index_); + table_->SaveSnapshot(name); +} + +} // namespace + +std::unique_ptr NewPersistentTableKeyValueStore( + const PersistentTableKeyValueStoreOptions& options) { + if (options.table_options.key_size == sizeof(uint64_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else if (options.table_options.key_size == sizeof(uint32_t)) { + return std::unique_ptr(new KeyValueStoreImpl(options)); + } else { + UNIMPLEMENTED(); + return nullptr; + } +} + +} // namespace embedding + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/cuda_device.cpp b/oneflow/core/ep/rocm/cuda_device.cpp index 850a490..d0dae8a 100644 --- a/oneflow/core/ep/rocm/cuda_device.cpp +++ b/oneflow/core/ep/rocm/cuda_device.cpp @@ -1,179 +1,179 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
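Both mock_key_value_store.hip.cpp and persistent_table_key_value_store.hip.cpp follow the same staging pattern in Get: copy the queried keys from device memory into a pinned host buffer, synchronize the stream, run the host-side lookup, then enqueue the results (values, n_missing, missing_indices) back to the device on the same stream. The sketch below condenses that pattern under simplifying assumptions: StagedGet and LookupOnHost are illustrative names, the key type is fixed to uint64_t, and error handling is reduced to abort().

// Sketch (illustrative): device -> pinned host staging, synchronous host lookup,
// then async copy of results back to the device on the caller's stream.
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdlib>
#include <functional>

#define CHECK_HIP(expr)                                  \
  do {                                                   \
    if ((expr) != hipSuccess) { std::abort(); }          \
  } while (0)

void StagedGet(hipStream_t stream, uint32_t num_keys, const uint64_t* device_keys,
               uint8_t* device_values, uint32_t value_size,
               uint64_t* host_keys /*pinned*/, uint8_t* host_values /*pinned*/,
               const std::function<void(uint32_t, const uint64_t*, uint8_t*)>& LookupOnHost) {
  // 1) Stage the device keys into pinned host memory and wait for the copy to land.
  CHECK_HIP(hipMemcpyAsync(host_keys, device_keys, num_keys * sizeof(uint64_t),
                           hipMemcpyDefault, stream));
  CHECK_HIP(hipStreamSynchronize(stream));
  // 2) Run the synchronous host-side lookup into the pinned value buffer.
  LookupOnHost(num_keys, host_keys, host_values);
  // 3) Enqueue the results back to device memory on the same stream.
  CHECK_HIP(hipMemcpyAsync(device_values, host_values, num_keys * value_size,
                           hipMemcpyDefault, stream));
}

Pinned host buffers (hipMallocHost or NumaAwareCudaMallocHost in the patch) keep both transfers DMA-capable, and the std::mutex in the real stores serializes concurrent callers because they all share one staging buffer.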
-*/ -#include "oneflow/core/ep/rocm/cuda_device.h" -#include "oneflow/core/ep/rocm/cuda_event.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#ifdef WITH_ROCM - -#include -#include - -// #if CUDA_VERSION >= 11000 -// #include -// #endif - -namespace oneflow { - -namespace ep { - -namespace { - -constexpr size_t kDefaultConstBufElementCount = 1024 * 1024; - -template -void CreateConstBuffer(void** buf, T value, size_t n) { - OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T))); - std::vector host(n, value); - OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault)); -} - -} // namespace - -CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager) - : device_index_(device_index), - event_flags_{}, - properties_{}, - device_manager_(device_manager), - const_buf_elem_cnt_(0), - const_zeros_buffer_(nullptr), - const_ones_buffer_fp32_(nullptr), - const_ones_buffer_fp16_(nullptr), - const_ones_buffer_bf16_(nullptr) { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_)); - event_flags_ = hipEventDisableTiming; - if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) { - event_flags_ |= hipEventBlockingSync; - } - const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT", - kDefaultConstBufElementCount); - if (const_buf_elem_cnt_ > 0) { - CreateConstBuffer(&const_zeros_buffer_, static_cast(0), const_buf_elem_cnt_); - CreateConstBuffer(&const_ones_buffer_fp32_, static_cast(1.0), - const_buf_elem_cnt_); - CreateConstBuffer(&const_ones_buffer_fp16_, static_cast(1.0), const_buf_elem_cnt_); -// #if CUDA_VERSION >= 11000 -// CreateConstBuffer(&const_ones_buffer_bf16_, static_cast(1.0), -// const_buf_elem_cnt_); -// #endif - } -} - -CudaDevice::~CudaDevice() { - CudaCurrentDeviceGuard guard(device_index_); - for (auto* event : events_) { delete event; } - OF_CUDA_CHECK(hipFree(const_zeros_buffer_)); - OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_)); - OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_)); - OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_)); -} - -void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); } - -Stream* CudaDevice::CreateStream() { - CudaCurrentDeviceGuard guard(device_index_); - return new CudaStream(this); -} - -void CudaDevice::DestroyStream(Stream* stream) { - CudaCurrentDeviceGuard guard(device_index_); - delete stream; -} - -void CudaDevice::CreateEvents(Event** events, size_t count) { - size_t copied = 0; - { - std::lock_guard lock(events_mutex_); - copied = std::min(count, events_.size()); - size_t offset = events_.size() - copied; - std::copy(events_.begin() + offset, events_.end(), events); - events_.resize(offset); - } - if (copied != count) { - CudaCurrentDeviceGuard guard(device_index_); - for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); } - } -} - -void CudaDevice::DestroyEvents(Event** events, size_t count) { - std::lock_guard lock(events_mutex_); - events_.insert(events_.end(), events, events + count); -} - -Maybe CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) { - CudaCurrentDeviceGuard guard(device_index_); - CHECK(!options.HasPinnedDevice()); - hipError_t err = hipMalloc(ptr, size); - if (err != hipSuccess) { - return Error::RuntimeError() << hipGetErrorString(err); - } else { - return Maybe::Ok(); - } -} - -void CudaDevice::Free(const AllocationOptions& attr, void* ptr) { - CudaCurrentDeviceGuard guard(device_index_); - 
OF_CUDA_CHECK(hipFree(ptr)); -} - -Maybe CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) { - CudaCurrentDeviceGuard guard(device_index_); - hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size); - if (err != hipSuccess) { - return Error::RuntimeError() << hipGetErrorString(err); - } else { - return Maybe::Ok(); - } -} - -void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(ptr)); -} - -const hipDeviceProp_t& CudaDevice::properties() const { return properties_; } - -const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const { - if (GetSizeOfDataType(data_type) * n - <= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) { - return const_zeros_buffer_; - } else { - return nullptr; - } -} - -const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const { - if (n <= const_buf_elem_cnt_) { - if (data_type == DataType::kFloat) { - return const_ones_buffer_fp32_; - } else if (data_type == DataType::kFloat16) { - return const_ones_buffer_fp16_; - } else if (data_type == DataType::kBFloat16) { - return const_ones_buffer_bf16_; - } else { - return nullptr; - } - } else { - return nullptr; - } -} - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
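For context on the constant buffers above: CreateConstBuffer allocates a device array, fills it from a host std::vector with a single hipMemcpy, and GetConstZeros/GetConstOnes later hand that array out whenever the request fits inside the preallocated element count (a ones vector of this kind is commonly used as one operand of GEMM-based reductions). A self-contained sketch under that reading; MakeConstBuffer is a made-up name, not an API introduced by this patch.

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

// Stand-in for CreateConstBuffer: a device buffer holding n copies of `value`.
template<typename T>
T* MakeConstBuffer(T value, size_t n) {
  T* device_buf = nullptr;
  if (hipMalloc(reinterpret_cast<void**>(&device_buf), n * sizeof(T)) != hipSuccess) {
    return nullptr;
  }
  std::vector<T> host(n, value);
  if (hipMemcpy(device_buf, host.data(), n * sizeof(T), hipMemcpyHostToDevice) != hipSuccess) {
    (void)hipFree(device_buf);
    return nullptr;
  }
  return device_buf;
}

int main() {
  constexpr size_t kElemCnt = 1024 * 1024;  // mirrors kDefaultConstBufElementCount
  float* ones_fp32 = MakeConstBuffer<float>(1.0f, kElemCnt);
  std::printf("const ones buffer at %p\n", static_cast<void*>(ones_fp32));
  (void)hipFree(ones_fp32);
  return 0;
}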
+*/ +#include "oneflow/core/ep/rocm/cuda_device.h" +#include "oneflow/core/ep/rocm/cuda_event.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#ifdef WITH_ROCM + +#include +#include + +// #if CUDA_VERSION >= 11000 +// #include +// #endif + +namespace oneflow { + +namespace ep { + +namespace { + +constexpr size_t kDefaultConstBufElementCount = 1024 * 1024; + +template +void CreateConstBuffer(void** buf, T value, size_t n) { + OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T))); + std::vector host(n, value); + OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault)); +} + +} // namespace + +CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager) + : device_index_(device_index), + event_flags_{}, + properties_{}, + device_manager_(device_manager), + const_buf_elem_cnt_(0), + const_zeros_buffer_(nullptr), + const_ones_buffer_fp32_(nullptr), + const_ones_buffer_fp16_(nullptr), + const_ones_buffer_bf16_(nullptr) { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_)); + event_flags_ = hipEventDisableTiming; + if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) { + event_flags_ |= hipEventBlockingSync; + } + const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT", + kDefaultConstBufElementCount); + if (const_buf_elem_cnt_ > 0) { + CreateConstBuffer(&const_zeros_buffer_, static_cast(0), const_buf_elem_cnt_); + CreateConstBuffer(&const_ones_buffer_fp32_, static_cast(1.0), + const_buf_elem_cnt_); + CreateConstBuffer(&const_ones_buffer_fp16_, static_cast(1.0), const_buf_elem_cnt_); +// #if CUDA_VERSION >= 11000 +// CreateConstBuffer(&const_ones_buffer_bf16_, static_cast(1.0), +// const_buf_elem_cnt_); +// #endif + } +} + +CudaDevice::~CudaDevice() { + CudaCurrentDeviceGuard guard(device_index_); + for (auto* event : events_) { delete event; } + OF_CUDA_CHECK(hipFree(const_zeros_buffer_)); + OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_)); + OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_)); + OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_)); +} + +void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); } + +Stream* CudaDevice::CreateStream() { + CudaCurrentDeviceGuard guard(device_index_); + return new CudaStream(this); +} + +void CudaDevice::DestroyStream(Stream* stream) { + CudaCurrentDeviceGuard guard(device_index_); + delete stream; +} + +void CudaDevice::CreateEvents(Event** events, size_t count) { + size_t copied = 0; + { + std::lock_guard lock(events_mutex_); + copied = std::min(count, events_.size()); + size_t offset = events_.size() - copied; + std::copy(events_.begin() + offset, events_.end(), events); + events_.resize(offset); + } + if (copied != count) { + CudaCurrentDeviceGuard guard(device_index_); + for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); } + } +} + +void CudaDevice::DestroyEvents(Event** events, size_t count) { + std::lock_guard lock(events_mutex_); + events_.insert(events_.end(), events, events + count); +} + +Maybe CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) { + CudaCurrentDeviceGuard guard(device_index_); + CHECK(!options.HasPinnedDevice()); + hipError_t err = hipMalloc(ptr, size); + if (err != hipSuccess) { + return Error::RuntimeError() << hipGetErrorString(err); + } else { + return Maybe::Ok(); + } +} + +void CudaDevice::Free(const AllocationOptions& attr, void* ptr) { + CudaCurrentDeviceGuard guard(device_index_); + 
OF_CUDA_CHECK(hipFree(ptr)); +} + +Maybe CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) { + CudaCurrentDeviceGuard guard(device_index_); + hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size); + if (err != hipSuccess) { + return Error::RuntimeError() << hipGetErrorString(err); + } else { + return Maybe::Ok(); + } +} + +void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipHostFree(ptr)); +} + +const hipDeviceProp_t& CudaDevice::properties() const { return properties_; } + +const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const { + if (GetSizeOfDataType(data_type) * n + <= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) { + return const_zeros_buffer_; + } else { + return nullptr; + } +} + +const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const { + if (n <= const_buf_elem_cnt_) { + if (data_type == DataType::kFloat) { + return const_ones_buffer_fp32_; + } else if (data_type == DataType::kFloat16) { + return const_ones_buffer_fp16_; + } else if (data_type == DataType::kBFloat16) { + return const_ones_buffer_bf16_; + } else { + return nullptr; + } + } else { + return nullptr; + } +} + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_device.h b/oneflow/core/ep/rocm/cuda_device.h index 76e1015..1623b8e 100644 --- a/oneflow/core/ep/rocm/cuda_device.h +++ b/oneflow/core/ep/rocm/cuda_device.h @@ -1,78 +1,78 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
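The CreateEvents/DestroyEvents pair above is a small recycling pool: destroyed events are parked in a vector behind a mutex and handed back before any new hipEvent_t is created, which keeps event construction off the hot path. A stripped-down sketch of the same idea; EventPool is a hypothetical class, not OneFlow's API.

#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstddef>
#include <mutex>
#include <vector>

class EventPool {
 public:
  // Hand back recycled events first, then create the remainder (the flag mirrors the
  // default hipEventDisableTiming used by CudaDevice).
  void Get(hipEvent_t* events, size_t count) {
    size_t reused = 0;
    {
      std::lock_guard<std::mutex> lock(mutex_);
      reused = std::min(count, pool_.size());
      std::copy(pool_.end() - static_cast<std::ptrdiff_t>(reused), pool_.end(), events);
      pool_.resize(pool_.size() - reused);
    }
    for (size_t i = reused; i < count; ++i) {
      (void)hipEventCreateWithFlags(&events[i], hipEventDisableTiming);
    }
  }
  // Returning events just parks them for later reuse.
  void Put(hipEvent_t* events, size_t count) {
    std::lock_guard<std::mutex> lock(mutex_);
    pool_.insert(pool_.end(), events, events + count);
  }

 private:
  std::mutex mutex_;
  std::vector<hipEvent_t> pool_;
};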
-*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ - -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/common/data_type.h" - -#ifdef WITH_ROCM - -#include - -namespace oneflow { - -namespace ep { - -class CudaDevice : public Device { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaDevice); - explicit CudaDevice(int device_index, DeviceManager* device_manager); - ~CudaDevice() override; - - void SetAsActiveDevice() override; - - DeviceType device_type() const override { return DeviceType::kCUDA; } - size_t device_index() const override { return device_index_; } - DeviceManager* device_manager() const override { return device_manager_; } - - Stream* CreateStream() override; - void DestroyStream(Stream* stream) override; - - void CreateEvents(Event** events, size_t count) override; - void DestroyEvents(Event** events, size_t count) override; - - Maybe Alloc(const AllocationOptions& options, void** ptr, size_t size) override; - void Free(const AllocationOptions& options, void* ptr) override; - Maybe AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override; - void FreePinned(const AllocationOptions& options, void* ptr) override; - - const hipDeviceProp_t& properties() const; - - const void* GetConstZeros(DataType data_type, size_t n) const; - const void* GetConstOnes(DataType data_type, size_t n) const; - - private: - int device_index_; - std::mutex events_mutex_; - std::vector events_; - unsigned int event_flags_; - hipDeviceProp_t properties_; - DeviceManager* device_manager_; - int64_t const_buf_elem_cnt_; - void* const_zeros_buffer_; - void* const_ones_buffer_fp32_; - void* const_ones_buffer_fp16_; - void* const_ones_buffer_bf16_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ + +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/common/data_type.h" + +#ifdef WITH_ROCM + +#include + +namespace oneflow { + +namespace ep { + +class CudaDevice : public Device { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaDevice); + explicit CudaDevice(int device_index, DeviceManager* device_manager); + ~CudaDevice() override; + + void SetAsActiveDevice() override; + + DeviceType device_type() const override { return DeviceType::kCUDA; } + size_t device_index() const override { return device_index_; } + DeviceManager* device_manager() const override { return device_manager_; } + + Stream* CreateStream() override; + void DestroyStream(Stream* stream) override; + + void CreateEvents(Event** events, size_t count) override; + void DestroyEvents(Event** events, size_t count) override; + + Maybe Alloc(const AllocationOptions& options, void** ptr, size_t size) override; + void Free(const AllocationOptions& options, void* ptr) override; + Maybe AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override; + void FreePinned(const AllocationOptions& options, void* ptr) override; + + const hipDeviceProp_t& properties() const; + + const void* GetConstZeros(DataType data_type, size_t n) const; + const void* GetConstOnes(DataType data_type, size_t n) const; + + private: + int device_index_; + std::mutex events_mutex_; + std::vector events_; + unsigned int event_flags_; + hipDeviceProp_t properties_; + DeviceManager* device_manager_; + int64_t const_buf_elem_cnt_; + void* const_zeros_buffer_; + void* const_ones_buffer_fp32_; + void* const_ones_buffer_fp16_; + void* const_ones_buffer_bf16_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ diff --git a/oneflow/core/ep/rocm/cuda_device_manager.cpp b/oneflow/core/ep/rocm/cuda_device_manager.cpp index 48664df..6ea769f 100644 --- a/oneflow/core/ep/rocm/cuda_device_manager.cpp +++ b/oneflow/core/ep/rocm/cuda_device_manager.cpp @@ -1,68 +1,68 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/rocm/cuda_device_manager.h" -#include "oneflow/core/device/cuda_util.h" - -#ifdef WITH_ROCM - -namespace oneflow { - -namespace ep { - -CudaDeviceManager::CudaDeviceManager(DeviceManagerRegistry* registry) : registry_(registry) {} -CudaDeviceManager::~CudaDeviceManager() = default; - -DeviceManagerRegistry* CudaDeviceManager::registry() const { return registry_; } - -std::shared_ptr CudaDeviceManager::GetDevice(size_t device_index) { - std::lock_guard lock(devices_mutex_); - if (device_index < devices_.size() && devices_.at(device_index)) { - return devices_.at(device_index); - } - auto device = std::make_shared(device_index, this); - if (device_index >= devices_.size()) { devices_.resize(device_index + 1); } - devices_.at(device_index) = device; - return device; -} - -size_t CudaDeviceManager::GetDeviceCount(size_t primary_device_index) { - CudaCurrentDeviceGuard guard(primary_device_index); - return this->GetDeviceCount(); -} - -size_t CudaDeviceManager::GetDeviceCount() { - int count = 0; - hipError_t err = hipGetDeviceCount(&count); - if (err == hipErrorNoDevice || err == hipErrorInsufficientDriver) { return 0; } - OF_CUDA_CHECK(err); - return count; -} - -size_t CudaDeviceManager::GetActiveDeviceIndex() { - int device = 0; - OF_CUDA_CHECK(hipGetDevice(&device)); - return static_cast(device); -} - -void CudaDeviceManager::SetActiveDeviceByIndex(size_t device_index) { - OF_CUDA_CHECK(hipSetDevice(static_cast(device_index))); -} - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/cuda_device_manager.h" +#include "oneflow/core/device/cuda_util.h" + +#ifdef WITH_ROCM + +namespace oneflow { + +namespace ep { + +CudaDeviceManager::CudaDeviceManager(DeviceManagerRegistry* registry) : registry_(registry) {} +CudaDeviceManager::~CudaDeviceManager() = default; + +DeviceManagerRegistry* CudaDeviceManager::registry() const { return registry_; } + +std::shared_ptr CudaDeviceManager::GetDevice(size_t device_index) { + std::lock_guard lock(devices_mutex_); + if (device_index < devices_.size() && devices_.at(device_index)) { + return devices_.at(device_index); + } + auto device = std::make_shared(device_index, this); + if (device_index >= devices_.size()) { devices_.resize(device_index + 1); } + devices_.at(device_index) = device; + return device; +} + +size_t CudaDeviceManager::GetDeviceCount(size_t primary_device_index) { + CudaCurrentDeviceGuard guard(primary_device_index); + return this->GetDeviceCount(); +} + +size_t CudaDeviceManager::GetDeviceCount() { + int count = 0; + hipError_t err = hipGetDeviceCount(&count); + if (err == hipErrorNoDevice || err == hipErrorInsufficientDriver) { return 0; } + OF_CUDA_CHECK(err); + return count; +} + +size_t CudaDeviceManager::GetActiveDeviceIndex() { + int device = 0; + OF_CUDA_CHECK(hipGetDevice(&device)); + return static_cast(device); +} + +void CudaDeviceManager::SetActiveDeviceByIndex(size_t device_index) { + OF_CUDA_CHECK(hipSetDevice(static_cast(device_index))); +} + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_device_manager.h b/oneflow/core/ep/rocm/cuda_device_manager.h index e1b9488..22a9fc8 100644 --- a/oneflow/core/ep/rocm/cuda_device_manager.h +++ b/oneflow/core/ep/rocm/cuda_device_manager.h @@ -1,54 +1,54 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ - -#include "oneflow/core/ep/include/device_manager.h" -#include "oneflow/core/ep/rocm/cuda_device.h" - -#ifdef WITH_ROCM - -namespace oneflow { -namespace ep { - -class CudaDevice; - -class CudaDeviceManager : public DeviceManager { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManager); - CudaDeviceManager(DeviceManagerRegistry* registry); - ~CudaDeviceManager() override; - - DeviceManagerRegistry* registry() const override; - std::shared_ptr GetDevice(size_t device_index) override; - size_t GetDeviceCount(size_t primary_device_index) override; - size_t GetDeviceCount() override; - size_t GetActiveDeviceIndex() override; - void SetActiveDeviceByIndex(size_t device_index) override; - - private: - std::mutex devices_mutex_; - std::vector> devices_; - DeviceManagerRegistry* registry_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ + +#include "oneflow/core/ep/include/device_manager.h" +#include "oneflow/core/ep/rocm/cuda_device.h" + +#ifdef WITH_ROCM + +namespace oneflow { +namespace ep { + +class CudaDevice; + +class CudaDeviceManager : public DeviceManager { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManager); + CudaDeviceManager(DeviceManagerRegistry* registry); + ~CudaDeviceManager() override; + + DeviceManagerRegistry* registry() const override; + std::shared_ptr GetDevice(size_t device_index) override; + size_t GetDeviceCount(size_t primary_device_index) override; + size_t GetDeviceCount() override; + size_t GetActiveDeviceIndex() override; + void SetActiveDeviceByIndex(size_t device_index) override; + + private: + std::mutex devices_mutex_; + std::vector> devices_; + DeviceManagerRegistry* registry_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_MANAGER_H_ diff --git a/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp b/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp index fb8d15c..6b559fe 100644 --- a/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp +++ b/oneflow/core/ep/rocm/cuda_device_manager_factory.cpp @@ -1,117 +1,117 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/include/device_manager_factory.h" -#include "oneflow/core/ep/include/device_manager_registry.h" -#include "oneflow/core/ep/rocm/cuda_device_manager.h" - -#ifdef WITH_ROCM - -#include -#include -#include - -namespace oneflow { - -namespace ep { - -namespace { - -std::string GetCudaVersionString(int version) { - return std::to_string(version / 1000) + "." + std::to_string((version % 1000) / 10); -} - -bool GetCudnnVersion(size_t* major, size_t* minor, size_t* patch) { - miopenStatus_t status = miopenGetVersion(major, minor, patch); - if (status == miopenStatusSuccess) { - return true; - } else { - LOG(ERROR) << "Failed to get cuDNN version: " << miopenGetErrorString(status); - return false; - } -} - -bool GetCudnnVersionString(std::string* version) { - size_t version_major = 0; - size_t version_minor = 0; - size_t version_patch = 0; - if (!GetCudnnVersion(&version_major, &version_minor, &version_patch)) { return false; } - *version = std::to_string(version_major) + "." + std::to_string(version_minor) + "." 
- + std::to_string(version_patch); - return true; -} - -void CudaDumpVersionInfo() { - { - int cuda_runtime_version = 0; - hipError_t err = hipRuntimeGetVersion(&cuda_runtime_version); - if (err == hipSuccess) { - LOG(INFO) << "CUDA runtime version: " << GetCudaVersionString(cuda_runtime_version); - } else { - LOG(ERROR) << "Failed to get cuda runtime version: " << hipGetErrorString(err); - } - } - - { - std::string cudnn_version_string; - if (GetCudnnVersionString(&cudnn_version_string)) { - LOG(INFO) << "cuDNN version: " << cudnn_version_string; - } - } - - { - int nccl_version = 0; - ncclResult_t result = ncclGetVersion(&nccl_version); - if (result == ncclSuccess) { - int nccl_version_major = - (nccl_version >= 20900) ? (nccl_version / 10000) : (nccl_version / 1000); - int nccl_version_minor = - (nccl_version >= 20900) ? (nccl_version % 10000) / 100 : (nccl_version % 1000) / 100; - int nccl_version_patch = (nccl_version % 100); - LOG(INFO) << "NCCL version: " << nccl_version_major << "." << nccl_version_minor << "." - << nccl_version_patch; - } else { - LOG(ERROR) << "Failed to get NCCL version: " << ncclGetErrorString(result); - } - } -} - -class CudaDeviceManagerFactory : public DeviceManagerFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManagerFactory); - CudaDeviceManagerFactory() = default; - ~CudaDeviceManagerFactory() override = default; - - std::unique_ptr NewDeviceManager(DeviceManagerRegistry* registry) override { - return std::make_unique(registry); - } - - DeviceType device_type() const override { return DeviceType::kCUDA; } - - std::string device_type_name() const override { return "cuda"; } - - void DumpVersionInfo() const override { CudaDumpVersionInfo(); } -}; - -COMMAND(DeviceManagerRegistry::RegisterDeviceManagerFactory( - std::make_unique())) - -} // namespace - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/device_manager_factory.h" +#include "oneflow/core/ep/include/device_manager_registry.h" +#include "oneflow/core/ep/rocm/cuda_device_manager.h" + +#ifdef WITH_ROCM + +#include +#include +#include + +namespace oneflow { + +namespace ep { + +namespace { + +std::string GetCudaVersionString(int version) { + return std::to_string(version / 1000) + "." + std::to_string((version % 1000) / 10); +} + +bool GetCudnnVersion(size_t* major, size_t* minor, size_t* patch) { + miopenStatus_t status = miopenGetVersion(major, minor, patch); + if (status == miopenStatusSuccess) { + return true; + } else { + LOG(ERROR) << "Failed to get cuDNN version: " << miopenGetErrorString(status); + return false; + } +} + +bool GetCudnnVersionString(std::string* version) { + size_t version_major = 0; + size_t version_minor = 0; + size_t version_patch = 0; + if (!GetCudnnVersion(&version_major, &version_minor, &version_patch)) { return false; } + *version = std::to_string(version_major) + "." + std::to_string(version_minor) + "." 
+ + std::to_string(version_patch); + return true; +} + +void CudaDumpVersionInfo() { + { + int cuda_runtime_version = 0; + hipError_t err = hipRuntimeGetVersion(&cuda_runtime_version); + if (err == hipSuccess) { + LOG(INFO) << "CUDA runtime version: " << GetCudaVersionString(cuda_runtime_version); + } else { + LOG(ERROR) << "Failed to get cuda runtime version: " << hipGetErrorString(err); + } + } + + { + std::string cudnn_version_string; + if (GetCudnnVersionString(&cudnn_version_string)) { + LOG(INFO) << "cuDNN version: " << cudnn_version_string; + } + } + + { + int nccl_version = 0; + ncclResult_t result = ncclGetVersion(&nccl_version); + if (result == ncclSuccess) { + int nccl_version_major = + (nccl_version >= 20900) ? (nccl_version / 10000) : (nccl_version / 1000); + int nccl_version_minor = + (nccl_version >= 20900) ? (nccl_version % 10000) / 100 : (nccl_version % 1000) / 100; + int nccl_version_patch = (nccl_version % 100); + LOG(INFO) << "NCCL version: " << nccl_version_major << "." << nccl_version_minor << "." + << nccl_version_patch; + } else { + LOG(ERROR) << "Failed to get NCCL version: " << ncclGetErrorString(result); + } + } +} + +class CudaDeviceManagerFactory : public DeviceManagerFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaDeviceManagerFactory); + CudaDeviceManagerFactory() = default; + ~CudaDeviceManagerFactory() override = default; + + std::unique_ptr NewDeviceManager(DeviceManagerRegistry* registry) override { + return std::make_unique(registry); + } + + DeviceType device_type() const override { return DeviceType::kCUDA; } + + std::string device_type_name() const override { return "cuda"; } + + void DumpVersionInfo() const override { CudaDumpVersionInfo(); } +}; + +COMMAND(DeviceManagerRegistry::RegisterDeviceManagerFactory( + std::make_unique())) + +} // namespace + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_event.cpp b/oneflow/core/ep/rocm/cuda_event.cpp index 20ce0f0..011adef 100644 --- a/oneflow/core/ep/rocm/cuda_event.cpp +++ b/oneflow/core/ep/rocm/cuda_event.cpp @@ -1,56 +1,56 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
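CudaDumpVersionInfo above decodes the packed integer returned by ncclGetVersion (RCCL exposes the same entry point): releases from 2.9 onward are encoded as major*10000 + minor*100 + patch, while older releases used major*1000 + minor*100 + patch, hence the 20900 threshold. A tiny standalone check of that arithmetic:

#include <cstdio>

// Decode a packed (N/R)CCL version the way CudaDumpVersionInfo does.
void PrintCclVersion(int v) {
  const int major = (v >= 20900) ? v / 10000 : v / 1000;
  const int minor = (v >= 20900) ? (v % 10000) / 100 : (v % 1000) / 100;
  const int patch = v % 100;
  std::printf("version %d.%d.%d\n", major, minor, patch);
}

int main() {
  PrintCclVersion(21205);  // 2.12.5 under the post-2.9 encoding
  PrintCclVersion(2708);   // 2.7.8 under the old encoding
  return 0;
}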
-*/ -#include "oneflow/core/ep/rocm/cuda_event.h" - -#ifdef WITH_ROCM - -namespace oneflow { - -namespace ep { - -CudaEvent::CudaEvent(unsigned int flags) : cuda_event_{} { - OF_CUDA_CHECK(hipEventCreateWithFlags(&cuda_event_, flags)); -} - -CudaEvent::~CudaEvent() { OF_CUDA_CHECK(hipEventDestroy(cuda_event_)); } - -Maybe CudaEvent::QueryDone() { - hipError_t err = hipEventQuery(cuda_event_); - if (err == hipSuccess) { - return Maybe(true); - } else if (err == hipErrorNotReady) { - return Maybe(false); - } else { - return Error::RuntimeError() << hipGetErrorString(err); - } -} - -Maybe CudaEvent::Sync() { - hipError_t err = hipEventSynchronize(cuda_event_); - if (err == hipSuccess) { - return Maybe::Ok(); - } else { - return Error::RuntimeError() << hipGetErrorString(err); - } -} - -hipEvent_t CudaEvent::cuda_event() { return cuda_event_; } - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/rocm/cuda_event.h" + +#ifdef WITH_ROCM + +namespace oneflow { + +namespace ep { + +CudaEvent::CudaEvent(unsigned int flags) : cuda_event_{} { + OF_CUDA_CHECK(hipEventCreateWithFlags(&cuda_event_, flags)); +} + +CudaEvent::~CudaEvent() { OF_CUDA_CHECK(hipEventDestroy(cuda_event_)); } + +Maybe CudaEvent::QueryDone() { + hipError_t err = hipEventQuery(cuda_event_); + if (err == hipSuccess) { + return Maybe(true); + } else if (err == hipErrorNotReady) { + return Maybe(false); + } else { + return Error::RuntimeError() << hipGetErrorString(err); + } +} + +Maybe CudaEvent::Sync() { + hipError_t err = hipEventSynchronize(cuda_event_); + if (err == hipSuccess) { + return Maybe::Ok(); + } else { + return Error::RuntimeError() << hipGetErrorString(err); + } +} + +hipEvent_t CudaEvent::cuda_event() { return cuda_event_; } + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_event.h b/oneflow/core/ep/rocm/cuda_event.h index 37a3379..62caf75 100644 --- a/oneflow/core/ep/rocm/cuda_event.h +++ b/oneflow/core/ep/rocm/cuda_event.h @@ -1,50 +1,50 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ - -#include "oneflow/core/ep/include/event.h" - -#ifdef WITH_ROCM - -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace ep { - -class CudaEvent : public Event { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaEvent); - explicit CudaEvent(unsigned int flags); - ~CudaEvent() override; - - Maybe QueryDone() override; - Maybe Sync() override; - - hipEvent_t cuda_event(); - - private: - hipEvent_t cuda_event_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ + +#include "oneflow/core/ep/include/event.h" + +#ifdef WITH_ROCM + +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace ep { + +class CudaEvent : public Event { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaEvent); + explicit CudaEvent(unsigned int flags); + ~CudaEvent() override; + + Maybe QueryDone() override; + Maybe Sync() override; + + hipEvent_t cuda_event(); + + private: + hipEvent_t cuda_event_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_EVENT_H_ diff --git a/oneflow/core/ep/rocm/cuda_stream.cpp b/oneflow/core/ep/rocm/cuda_stream.cpp index 18f1870..1508ba0 100644 --- a/oneflow/core/ep/rocm/cuda_stream.cpp +++ b/oneflow/core/ep/rocm/cuda_stream.cpp @@ -1,180 +1,180 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/job/global_for.h" -#include "oneflow/core/job/resource_desc.h" -#include "oneflow/core/hardware/node_device_descriptor_manager.h" -#include "oneflow/core/hardware/cuda_device_descriptor.h" -#include "oneflow/core/ep/rocm/cuda_event.h" -#include "oneflow/core/ep/rocm/cuda_device.h" - -#ifdef WITH_ROCM - -namespace oneflow { - -namespace ep { - -namespace { - -constexpr size_t kDefaultWorkspaceSize = 4 * 1024 * 1024; // 4M - -void SetAffinityByDevice(int dev_id) { - auto node_device_desc_mgr = Singleton::Get(); - if (node_device_desc_mgr == nullptr) { return; } - auto node_device_desc = node_device_desc_mgr->GetLocalNodeDeviceDescriptor(); - auto cuda_device = std::dynamic_pointer_cast( - node_device_desc->GetDevice(hardware::kCudaDeviceDescriptorClassName, dev_id)); - if (!cuda_device) { return; } - node_device_desc->Topology()->SetCPUAffinityByPCIBusID(cuda_device->PCIBusID()); - node_device_desc->Topology()->SetMemoryAffinityByPCIBusID(cuda_device->PCIBusID()); -} - -} // namespace - -#ifdef WITH_ROCM_GRAPHS - -CudaGraphExecutable::CudaGraphExecutable() : graph_exec_(nullptr), dev_(-1) {} - -CudaGraphExecutable::~CudaGraphExecutable() { Reset(); } - -void CudaGraphExecutable::Update(hipGraph_t graph) { - int dev = -1; - OF_CUDA_CHECK(hipGetDevice(&dev)); - if (dev != dev_) { Reset(); } - dev_ = dev; - if (graph_exec_ != nullptr) { - hipGraphExecUpdateResult update_result{}; - hipGraphNode_t error_node = nullptr; - OF_CUDA_CHECK(hipGraphExecUpdate(graph_exec_, graph, &error_node, &update_result)); - if (update_result == hipGraphExecUpdateSuccess) { return; } - } - Reset(); - OF_CUDA_CHECK(hipGraphInstantiate(&graph_exec_, graph, NULL, NULL, 0)); -} - -void CudaGraphExecutable::Launch(hipStream_t stream) const { - OF_CUDA_CHECK(hipGraphLaunch(graph_exec_, stream)); -} - -bool CudaGraphExecutable::IsInstantiated() const { return graph_exec_ != nullptr; } - -void CudaGraphExecutable::Reset() { - if (graph_exec_ != nullptr) { - CudaCurrentDeviceGuard guard(dev_); - OF_CUDA_CHECK(hipGraphExecDestroy(graph_exec_)); - } -} - -#endif // WITH_ROCM_GRAPHS - -CudaStream::CudaStream(CudaDevice* device) - : device_index_(device->device_index()), device_(device) { - CudaCurrentDeviceGuard guard(device_index_); - // cuda_stream - OF_CUDA_CHECK(hipStreamCreate(&cuda_stream_)); - // cublas_handle - OF_CUBLAS_CHECK(hipblasCreate(&cublas_handle_)); - OF_CUBLAS_CHECK(hipblasSetStream(cublas_handle_, cuda_stream_)); - - workspace_size_ = kDefaultWorkspaceSize; - OF_CUDA_CHECK(hipMalloc(&workspace_, workspace_size_)); - - OF_CUDNN_CHECK(hipdnnCreate(&cudnn_handle_)); - - OF_CUDNN_CHECK(hipdnnSetStream(cudnn_handle_, cuda_stream_)); -} - -CudaStream::~CudaStream() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipStreamSynchronize(cuda_stream_)); - OF_CUDNN_CHECK(hipdnnDestroy(cudnn_handle_)); - OF_CUBLAS_CHECK(hipblasDestroy(cublas_handle_)); - - OF_CUDA_CHECK(hipStreamDestroy(cuda_stream_)); - OF_CUDA_CHECK(hipFree(workspace_)); -} - -Maybe CudaStream::OnExecutionContextSetup() { - OF_CUDA_CHECK(hipSetDevice(device_index_)); - SetAffinityByDevice(device_index_); - return Maybe::Ok(); -} - -Maybe CudaStream::OnExecutionContextTeardown() { return Maybe::Ok(); } - -DeviceType CudaStream::device_type() const { return DeviceType::kCUDA; } - -CudaDevice* CudaStream::device() const { return device_; } - -Maybe CudaStream::Sync() { - hipError_t err = hipStreamSynchronize(cuda_stream_); - if (err == hipSuccess) { - return 
Maybe::Ok(); - } else { - return Error::RuntimeError() << hipGetErrorString(err) << " (" << err << ") "; - } -} - -void CudaStream::RecordEvent(Event* event) { - auto* cuda_event = static_cast(event); // NOLINT - OF_CUDA_CHECK(hipEventRecord(cuda_event->cuda_event(), cuda_stream_)); -} - -hipStream_t CudaStream::cuda_stream() const { return cuda_stream_; } - -hipblasHandle_t CudaStream::cublas_handle() const { return cublas_handle_; } - -void* CudaStream::cublas_workspace() const { return workspace_; } - -size_t CudaStream::cublas_workspace_size() const { return workspace_size_; } - -hipdnnHandle_t CudaStream::cudnn_handle() const { return cudnn_handle_; } - -const hipDeviceProp_t& CudaStream::device_properties() const { return device_->properties(); } - -int CudaStream::cuda_arch() const { - return device_->properties().major * 100 + device_->properties().minor * 10; -} - -#ifdef WITH_ROCM_GRAPHS - -void CudaStream::BeginGraphCapture() { - CHECK(!is_graph_capturing_); - is_graph_capturing_ = true; - OF_CUDA_CHECK(hipStreamBeginCapture(cuda_stream_, hipStreamCaptureModeThreadLocal)); -} - -void CudaStream::EndGraphCapture(CudaGraphExecutable* executable) { - hipGraph_t graph = nullptr; - OF_CUDA_CHECK(hipStreamEndCapture(cuda_stream_, &graph)); - executable->Update(graph); - OF_CUDA_CHECK(hipGraphDestroy(graph)); - is_graph_capturing_ = false; -} - -bool CudaStream::IsGraphCapturing() const { return is_graph_capturing_; } - -void CudaStream::LaunchGraph(const CudaGraphExecutable* executable) { - executable->Launch(cuda_stream_); -} - -#endif // WITH_ROCM_GRAPHS - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/job/global_for.h" +#include "oneflow/core/job/resource_desc.h" +#include "oneflow/core/hardware/node_device_descriptor_manager.h" +#include "oneflow/core/hardware/cuda_device_descriptor.h" +#include "oneflow/core/ep/rocm/cuda_event.h" +#include "oneflow/core/ep/rocm/cuda_device.h" + +#ifdef WITH_ROCM + +namespace oneflow { + +namespace ep { + +namespace { + +constexpr size_t kDefaultWorkspaceSize = 4 * 1024 * 1024; // 4M + +void SetAffinityByDevice(int dev_id) { + auto node_device_desc_mgr = Singleton::Get(); + if (node_device_desc_mgr == nullptr) { return; } + auto node_device_desc = node_device_desc_mgr->GetLocalNodeDeviceDescriptor(); + auto cuda_device = std::dynamic_pointer_cast( + node_device_desc->GetDevice(hardware::kCudaDeviceDescriptorClassName, dev_id)); + if (!cuda_device) { return; } + node_device_desc->Topology()->SetCPUAffinityByPCIBusID(cuda_device->PCIBusID()); + node_device_desc->Topology()->SetMemoryAffinityByPCIBusID(cuda_device->PCIBusID()); +} + +} // namespace + +#ifdef WITH_ROCM_GRAPHS + +CudaGraphExecutable::CudaGraphExecutable() : graph_exec_(nullptr), dev_(-1) {} + +CudaGraphExecutable::~CudaGraphExecutable() { Reset(); } + +void CudaGraphExecutable::Update(hipGraph_t graph) { + int dev = -1; + OF_CUDA_CHECK(hipGetDevice(&dev)); + if (dev != dev_) { Reset(); } + dev_ = dev; + if (graph_exec_ != nullptr) { + hipGraphExecUpdateResult update_result{}; + hipGraphNode_t error_node = nullptr; + OF_CUDA_CHECK(hipGraphExecUpdate(graph_exec_, graph, &error_node, &update_result)); + if (update_result == hipGraphExecUpdateSuccess) { return; } + } + Reset(); + OF_CUDA_CHECK(hipGraphInstantiate(&graph_exec_, graph, NULL, NULL, 0)); +} + +void CudaGraphExecutable::Launch(hipStream_t stream) const { + OF_CUDA_CHECK(hipGraphLaunch(graph_exec_, stream)); +} + +bool CudaGraphExecutable::IsInstantiated() const { return graph_exec_ != nullptr; } + +void CudaGraphExecutable::Reset() { + if (graph_exec_ != nullptr) { + CudaCurrentDeviceGuard guard(dev_); + OF_CUDA_CHECK(hipGraphExecDestroy(graph_exec_)); + } +} + +#endif // WITH_ROCM_GRAPHS + +CudaStream::CudaStream(CudaDevice* device) + : device_index_(device->device_index()), device_(device) { + CudaCurrentDeviceGuard guard(device_index_); + // cuda_stream + OF_CUDA_CHECK(hipStreamCreate(&cuda_stream_)); + // cublas_handle + OF_CUBLAS_CHECK(hipblasCreate(&cublas_handle_)); + OF_CUBLAS_CHECK(hipblasSetStream(cublas_handle_, cuda_stream_)); + + workspace_size_ = kDefaultWorkspaceSize; + OF_CUDA_CHECK(hipMalloc(&workspace_, workspace_size_)); + + OF_CUDNN_CHECK(hipdnnCreate(&cudnn_handle_)); + + OF_CUDNN_CHECK(hipdnnSetStream(cudnn_handle_, cuda_stream_)); +} + +CudaStream::~CudaStream() { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipStreamSynchronize(cuda_stream_)); + OF_CUDNN_CHECK(hipdnnDestroy(cudnn_handle_)); + OF_CUBLAS_CHECK(hipblasDestroy(cublas_handle_)); + + OF_CUDA_CHECK(hipStreamDestroy(cuda_stream_)); + OF_CUDA_CHECK(hipFree(workspace_)); +} + +Maybe CudaStream::OnExecutionContextSetup() { + OF_CUDA_CHECK(hipSetDevice(device_index_)); + SetAffinityByDevice(device_index_); + return Maybe::Ok(); +} + +Maybe CudaStream::OnExecutionContextTeardown() { return Maybe::Ok(); } + +DeviceType CudaStream::device_type() const { return DeviceType::kCUDA; } + +CudaDevice* CudaStream::device() const { return device_; } + +Maybe CudaStream::Sync() { + hipError_t err = hipStreamSynchronize(cuda_stream_); + if (err == hipSuccess) { + return 
Maybe::Ok(); + } else { + return Error::RuntimeError() << hipGetErrorString(err) << " (" << err << ") "; + } +} + +void CudaStream::RecordEvent(Event* event) { + auto* cuda_event = static_cast(event); // NOLINT + OF_CUDA_CHECK(hipEventRecord(cuda_event->cuda_event(), cuda_stream_)); +} + +hipStream_t CudaStream::cuda_stream() const { return cuda_stream_; } + +hipblasHandle_t CudaStream::cublas_handle() const { return cublas_handle_; } + +void* CudaStream::cublas_workspace() const { return workspace_; } + +size_t CudaStream::cublas_workspace_size() const { return workspace_size_; } + +hipdnnHandle_t CudaStream::cudnn_handle() const { return cudnn_handle_; } + +const hipDeviceProp_t& CudaStream::device_properties() const { return device_->properties(); } + +int CudaStream::cuda_arch() const { + return device_->properties().major * 100 + device_->properties().minor * 10; +} + +#ifdef WITH_ROCM_GRAPHS + +void CudaStream::BeginGraphCapture() { + CHECK(!is_graph_capturing_); + is_graph_capturing_ = true; + OF_CUDA_CHECK(hipStreamBeginCapture(cuda_stream_, hipStreamCaptureModeThreadLocal)); +} + +void CudaStream::EndGraphCapture(CudaGraphExecutable* executable) { + hipGraph_t graph = nullptr; + OF_CUDA_CHECK(hipStreamEndCapture(cuda_stream_, &graph)); + executable->Update(graph); + OF_CUDA_CHECK(hipGraphDestroy(graph)); + is_graph_capturing_ = false; +} + +bool CudaStream::IsGraphCapturing() const { return is_graph_capturing_; } + +void CudaStream::LaunchGraph(const CudaGraphExecutable* executable) { + executable->Launch(cuda_stream_); +} + +#endif // WITH_ROCM_GRAPHS + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/cuda_stream.h b/oneflow/core/ep/rocm/cuda_stream.h index b3149a7..b63af20 100644 --- a/oneflow/core/ep/rocm/cuda_stream.h +++ b/oneflow/core/ep/rocm/cuda_stream.h @@ -1,168 +1,168 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
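The WITH_ROCM_GRAPHS blocks above mirror the CUDA graph path: the stream is put into thread-local capture mode, the captured hipGraph_t is applied to an existing executable with hipGraphExecUpdate when the topology is unchanged and re-instantiated otherwise, and the executable is then launched. Note that the patch keeps the #define WITH_ROCM_GRAPHS commented out in cuda_stream.h, so this path appears disabled by default. A condensed sketch of the capture / update-or-instantiate flow, assuming graph support in the HIP runtime; CaptureAndLaunch is a hypothetical wrapper.

#include <hip/hip_runtime.h>

void CaptureAndLaunch(hipStream_t stream, hipGraphExec_t* exec,
                      void (*enqueue_work)(hipStream_t)) {
  (void)hipStreamBeginCapture(stream, hipStreamCaptureModeThreadLocal);
  enqueue_work(stream);  // the work recorded between Begin/EndCapture becomes the graph
  hipGraph_t graph = nullptr;
  (void)hipStreamEndCapture(stream, &graph);

  bool updated = false;
  if (*exec != nullptr) {
    hipGraphExecUpdateResult result{};
    hipGraphNode_t error_node = nullptr;
    // Cheap path: patch the existing executable if the captured topology still matches.
    if (hipGraphExecUpdate(*exec, graph, &error_node, &result) == hipSuccess
        && result == hipGraphExecUpdateSuccess) {
      updated = true;
    }
  }
  if (!updated) {
    if (*exec != nullptr) { (void)hipGraphExecDestroy(*exec); }
    (void)hipGraphInstantiate(exec, graph, nullptr, nullptr, 0);
  }
  (void)hipGraphDestroy(graph);
  (void)hipGraphLaunch(*exec, stream);
}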
-*/ -#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ -#define ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ - -#include "oneflow/core/ep/include/stream.h" -#include "oneflow/core/ep/rocm/cuda_device.h" - -#ifdef WITH_ROCM - -#include -#include "oneflow/core/hipdnn/hipdnn.h" - -// #if CUDA_VERSION >= 11000 -// #define WITH_ROCM_GRAPHS -// #endif // CUDA_VERSION >= 11000 - -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace ep { - -class CudaDevice; - -#ifdef WITH_ROCM_GRAPHS - -class CudaGraphExecutable { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaGraphExecutable); - CudaGraphExecutable(); - ~CudaGraphExecutable(); - - void Update(hipGraph_t graph); - void Launch(hipStream_t stream) const; - bool IsInstantiated() const; - - private: - void Reset(); - - hipGraphExec_t graph_exec_; - int dev_; -}; - -#endif // WITH_ROCM_GRAPHS - -struct CudaLaunchConfig { - dim3 grid_dim; - dim3 block_dim; - size_t shared_mem_size; - CudaLaunchConfig() : grid_dim{}, block_dim{}, shared_mem_size(0) {} - - CudaLaunchConfig(unsigned int grid_size, unsigned int block_size, size_t shared_mem_size) - : grid_dim(grid_size), block_dim(block_size), shared_mem_size(shared_mem_size) {} -}; - -class CudaStream : public Stream { - public: - OF_DISALLOW_COPY_AND_MOVE(CudaStream); - explicit CudaStream(CudaDevice* device); - ~CudaStream() override; - - static constexpr uint32_t kDefaultBlockSize = 256; - - DeviceType device_type() const override; - CudaDevice* device() const override; - Maybe Sync() override; - void RecordEvent(Event* event) override; - - Maybe OnExecutionContextSetup() override; - Maybe OnExecutionContextTeardown() override; - - hipStream_t cuda_stream() const; - hipblasHandle_t cublas_handle() const; - -// #if CUDA_VERSION >= 10010 - -// cublasLtHandle_t cublas_lt_handle() const; - -// #endif - - hipdnnHandle_t cudnn_handle() const; - void* cublas_workspace() const; - size_t cublas_workspace_size() const; - const hipDeviceProp_t& device_properties() const; - int cuda_arch() const; - - void InitLaunchConfigWithWaves(CudaLaunchConfig* config, size_t elem_cnt, size_t block_size, - size_t max_waves) const { - const uint32_t max_grid_size = max_waves * device_properties().multiProcessorCount - * (device_properties().maxThreadsPerMultiProcessor / block_size); - const uint32_t grid_size = - std::min(max_grid_size, (elem_cnt + block_size - 1) / block_size); - config->grid_dim = dim3(grid_size); - config->block_dim = dim3(block_size); - config->shared_mem_size = 0; - } - -#ifdef __HIPCC__ - template - void LaunchKernel(void (*kernel)(Params...), const CudaLaunchConfig& launch_config, - Args... args) { - kernel<<>>(args...); - } - - template - void LaunchKernel(void (*kernel)(Params...), size_t elem_cnt, size_t max_waves, Args... args) { - constexpr uint32_t block_size = kDefaultBlockSize; - CudaLaunchConfig config{}; - InitLaunchConfigWithWaves(&config, elem_cnt, block_size, max_waves); - LaunchKernel(kernel, config, args...); - } - - template - void LaunchKernelDefaultWaves(void (*kernel)(Params...), size_t elem_cnt, Args... 
args) { - const size_t default_waves = 32; - LaunchKernel(kernel, elem_cnt, default_waves, args...); - } -#endif // __HIPCC__ - -#ifdef WITH_ROCM_GRAPHS - void BeginGraphCapture(); - void EndGraphCapture(CudaGraphExecutable* executable); - bool IsGraphCapturing() const; - void LaunchGraph(const CudaGraphExecutable* executable); -#endif // WITH_ROCM_GRAPHS - - private: - hipStream_t cuda_stream_{}; - hipblasHandle_t cublas_handle_{}; - -// #if CUDA_VERSION >= 10010 - -// cublasLtHandle_t cublas_lt_handle_{}; - -// #endif - - hipdnnHandle_t cudnn_handle_{}; - int device_index_; - void* workspace_{}; - size_t workspace_size_{}; -#ifdef WITH_ROCM_GRAPHS - bool is_graph_capturing_{}; -#endif // WITH_ROCM_GRAPHS - CudaDevice* device_; -}; - -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ +#define ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ + +#include "oneflow/core/ep/include/stream.h" +#include "oneflow/core/ep/rocm/cuda_device.h" + +#ifdef WITH_ROCM + +#include +#include "oneflow/core/hipdnn/hipdnn.h" + +// #if CUDA_VERSION >= 11000 +// #define WITH_ROCM_GRAPHS +// #endif // CUDA_VERSION >= 11000 + +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace ep { + +class CudaDevice; + +#ifdef WITH_ROCM_GRAPHS + +class CudaGraphExecutable { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaGraphExecutable); + CudaGraphExecutable(); + ~CudaGraphExecutable(); + + void Update(hipGraph_t graph); + void Launch(hipStream_t stream) const; + bool IsInstantiated() const; + + private: + void Reset(); + + hipGraphExec_t graph_exec_; + int dev_; +}; + +#endif // WITH_ROCM_GRAPHS + +struct CudaLaunchConfig { + dim3 grid_dim; + dim3 block_dim; + size_t shared_mem_size; + CudaLaunchConfig() : grid_dim{}, block_dim{}, shared_mem_size(0) {} + + CudaLaunchConfig(unsigned int grid_size, unsigned int block_size, size_t shared_mem_size) + : grid_dim(grid_size), block_dim(block_size), shared_mem_size(shared_mem_size) {} +}; + +class CudaStream : public Stream { + public: + OF_DISALLOW_COPY_AND_MOVE(CudaStream); + explicit CudaStream(CudaDevice* device); + ~CudaStream() override; + + static constexpr uint32_t kDefaultBlockSize = 256; + + DeviceType device_type() const override; + CudaDevice* device() const override; + Maybe Sync() override; + void RecordEvent(Event* event) override; + + Maybe OnExecutionContextSetup() override; + Maybe OnExecutionContextTeardown() override; + + hipStream_t cuda_stream() const; + hipblasHandle_t cublas_handle() const; + +// #if CUDA_VERSION >= 10010 + +// cublasLtHandle_t cublas_lt_handle() const; + +// #endif + + hipdnnHandle_t cudnn_handle() const; + void* cublas_workspace() const; + size_t cublas_workspace_size() const; + const hipDeviceProp_t& device_properties() const; + int cuda_arch() const; + + void InitLaunchConfigWithWaves(CudaLaunchConfig* config, size_t elem_cnt, 
size_t block_size, + size_t max_waves) const { + const uint32_t max_grid_size = max_waves * device_properties().multiProcessorCount + * (device_properties().maxThreadsPerMultiProcessor / block_size); + const uint32_t grid_size = + std::min(max_grid_size, (elem_cnt + block_size - 1) / block_size); + config->grid_dim = dim3(grid_size); + config->block_dim = dim3(block_size); + config->shared_mem_size = 0; + } + +#ifdef __HIPCC__ + template + void LaunchKernel(void (*kernel)(Params...), const CudaLaunchConfig& launch_config, + Args... args) { + kernel<<>>(args...); + } + + template + void LaunchKernel(void (*kernel)(Params...), size_t elem_cnt, size_t max_waves, Args... args) { + constexpr uint32_t block_size = kDefaultBlockSize; + CudaLaunchConfig config{}; + InitLaunchConfigWithWaves(&config, elem_cnt, block_size, max_waves); + LaunchKernel(kernel, config, args...); + } + + template + void LaunchKernelDefaultWaves(void (*kernel)(Params...), size_t elem_cnt, Args... args) { + const size_t default_waves = 32; + LaunchKernel(kernel, elem_cnt, default_waves, args...); + } +#endif // __HIPCC__ + +#ifdef WITH_ROCM_GRAPHS + void BeginGraphCapture(); + void EndGraphCapture(CudaGraphExecutable* executable); + bool IsGraphCapturing() const; + void LaunchGraph(const CudaGraphExecutable* executable); +#endif // WITH_ROCM_GRAPHS + + private: + hipStream_t cuda_stream_{}; + hipblasHandle_t cublas_handle_{}; + +// #if CUDA_VERSION >= 10010 + +// cublasLtHandle_t cublas_lt_handle_{}; + +// #endif + + hipdnnHandle_t cudnn_handle_{}; + int device_index_; + void* workspace_{}; + size_t workspace_size_{}; +#ifdef WITH_ROCM_GRAPHS + bool is_graph_capturing_{}; +#endif // WITH_ROCM_GRAPHS + CudaDevice* device_; +}; + +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_EP_ROCM_CUDA_STREAM_H_ diff --git a/oneflow/core/ep/rocm/primitive/add.hip.cpp b/oneflow/core/ep/rocm/primitive/add.hip.cpp index 174cdbb..20cdd17 100644 --- a/oneflow/core/ep/rocm/primitive/add.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/add.hip.cpp @@ -1,139 +1,139 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/add.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/device/cuda_pseudo_bfloat16.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -struct AddFunctor; - -template -struct AddFunctor { - __device__ T operator()(T x) const { return x; } -}; - -template -struct AddFunctor { - __device__ T operator()(T x0, U x1, Args... xs) const { - return x0 + AddFunctor()(x1, xs...); - } -}; - -template -__global__ void AddGpu(const Args*... 
srcs, T* dst, size_t count) { - CUDA_1D_KERNEL_LOOP_T(size_t, i, count) { dst[i] = AddFunctor()(srcs[i]...); } -} - -template -void LaunchAddGpu(hipStream_t stream, const Args*... srcs, T* dst, size_t count) { - AddGpu - <<>>(srcs..., dst, count); -} - -template -void DispatchLaunch(hipStream_t stream, const T* const* srcs, size_t arity, T* dst, size_t count) { - if (arity == 0) { - OF_CUDA_CHECK(hipMemsetAsync(dst, 0, count * sizeof(T), stream)); - } else if (arity == 1) { - OF_CUDA_CHECK(hipMemcpyAsync(dst, srcs[0], count * sizeof(T), hipMemcpyDefault, stream)); - } else if (arity == 2) { - OF_CUDA_CHECK((cuda::elementwise::Binary, T, T, T>( - AddFunctor(), count, dst, srcs[0], srcs[1], stream))); - } else if (arity == 3) { - OF_CUDA_CHECK((cuda::elementwise::Ternary, T, T, T, T>( - AddFunctor(), count, dst, srcs[0], srcs[1], srcs[2], stream))); - } else if (arity == 4) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], dst, count); - } else if (arity == 5) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], dst, count); - } else if (arity == 6) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], srcs[5], - dst, count); - } else if (arity == 7) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], - srcs[5], srcs[6], dst, count); - } else if (arity == 8) { - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], - srcs[5], srcs[6], srcs[7], dst, count); - } else { - DispatchLaunch(stream, srcs + 7, arity - 7, dst, count); - LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], - srcs[5], srcs[6], dst, dst, count); - } -} - -template -class AddImpl : public Add { - public: - OF_DISALLOW_COPY_AND_MOVE(AddImpl); - AddImpl() = default; - ~AddImpl() override = default; - - using Add::Launch; - void Launch(Stream* stream, const void* const* srcs, size_t arity, void* dst, - size_t count) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - DispatchLaunch(cuda_stream, reinterpret_cast(srcs), arity, - reinterpret_cast(dst), count); - } -}; - -template -std::unique_ptr NewAdd() { - return std::unique_ptr(new AddImpl()); -} - -class AddFactoryImpl : public AddFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(AddFactoryImpl); - AddFactoryImpl() = default; - ~AddFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_ADD_ENTRY(type_cpp, type_proto) {type_proto, NewAdd}, - - static const std::map()>> new_add_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_ADD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; - -#undef MAKE_NEW_ADD_ENTRY - - const auto it = new_add_handle.find(data_type); - if (it != new_add_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, AddFactory, AddFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/add.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/device/cuda_pseudo_bfloat16.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +struct AddFunctor; + +template +struct AddFunctor { + __device__ T operator()(T x) const { return x; } +}; + +template +struct AddFunctor { + __device__ T operator()(T x0, U x1, Args... xs) const { + return x0 + AddFunctor()(x1, xs...); + } +}; + +template +__global__ void AddGpu(const Args*... srcs, T* dst, size_t count) { + CUDA_1D_KERNEL_LOOP_T(size_t, i, count) { dst[i] = AddFunctor()(srcs[i]...); } +} + +template +void LaunchAddGpu(hipStream_t stream, const Args*... srcs, T* dst, size_t count) { + AddGpu + <<>>(srcs..., dst, count); +} + +template +void DispatchLaunch(hipStream_t stream, const T* const* srcs, size_t arity, T* dst, size_t count) { + if (arity == 0) { + OF_CUDA_CHECK(hipMemsetAsync(dst, 0, count * sizeof(T), stream)); + } else if (arity == 1) { + OF_CUDA_CHECK(hipMemcpyAsync(dst, srcs[0], count * sizeof(T), hipMemcpyDefault, stream)); + } else if (arity == 2) { + OF_CUDA_CHECK((cuda::elementwise::Binary, T, T, T>( + AddFunctor(), count, dst, srcs[0], srcs[1], stream))); + } else if (arity == 3) { + OF_CUDA_CHECK((cuda::elementwise::Ternary, T, T, T, T>( + AddFunctor(), count, dst, srcs[0], srcs[1], srcs[2], stream))); + } else if (arity == 4) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], dst, count); + } else if (arity == 5) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], dst, count); + } else if (arity == 6) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], srcs[5], + dst, count); + } else if (arity == 7) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], + srcs[5], srcs[6], dst, count); + } else if (arity == 8) { + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], + srcs[5], srcs[6], srcs[7], dst, count); + } else { + DispatchLaunch(stream, srcs + 7, arity - 7, dst, count); + LaunchAddGpu(stream, srcs[0], srcs[1], srcs[2], srcs[3], srcs[4], + srcs[5], srcs[6], dst, dst, count); + } +} + +template +class AddImpl : public Add { + public: + OF_DISALLOW_COPY_AND_MOVE(AddImpl); + AddImpl() = default; + ~AddImpl() override = default; + + using Add::Launch; + void Launch(Stream* stream, const void* const* srcs, size_t arity, void* dst, + size_t count) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + DispatchLaunch(cuda_stream, reinterpret_cast(srcs), arity, + reinterpret_cast(dst), count); + } +}; + +template +std::unique_ptr NewAdd() { + return std::unique_ptr(new AddImpl()); +} + +class AddFactoryImpl : public AddFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(AddFactoryImpl); + AddFactoryImpl() = default; + ~AddFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_ADD_ENTRY(type_cpp, type_proto) {type_proto, NewAdd}, + + static const std::map()>> new_add_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_ADD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; + +#undef MAKE_NEW_ADD_ENTRY + + const auto it = new_add_handle.find(data_type); + if (it != new_add_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, AddFactory, AddFactoryImpl); + +} // namespace + +} // 
namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/binary_functor.hip.h b/oneflow/core/ep/rocm/primitive/binary_functor.hip.h index 3dd42dc..b04935e 100644 --- a/oneflow/core/ep/rocm/primitive/binary_functor.hip.h +++ b/oneflow/core/ep/rocm/primitive/binary_functor.hip.h @@ -1,151 +1,151 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/ep/common/primitive/binary_functor.h" - -namespace oneflow { -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src src0, Src src1) const { return pow(src0, src1); } -}; - -template<> -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(bool src0, bool src1) const { - return static_cast(pow(static_cast(src0), static_cast(src1))); - } -}; - -template<> -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC half operator()(half src0, half src1) const { - return static_cast(pow(static_cast(src0), static_cast(src1))); - } -}; - -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) { -#if defined(__CUDA_ARCH__) - coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); -#elif defined(__HIP_DEVICE_COMPILE__) - coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); -#else - coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); -#endif - } - - OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { - return static_cast(0.5) - * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) - + x * coef * exp(static_cast(-0.5) * x * x)) - * dy; - } - Src coef; -}; - -template -struct BinaryFunctor { - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { - Src tanh_val = tanh(x); - return static_cast(dy * (static_cast(1.0) - tanh_val * tanh_val)); - } -}; - -// /*********nv_bfloat16_kernel*******/ - -// #if CUDA_VERSION >= 11000 - -// template<> -// struct BinaryFunctor { -// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} - -// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { -// return static_cast(pow(static_cast(src0), static_cast(src1))); -// } -// }; - -// #define SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(op) \ -// template<> \ -// struct BinaryFunctor { \ -// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ -// \ -// BinaryFunctor float_functor; \ -// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { \ -// return __float2bfloat16(float_functor(__bfloat162float(src0), __bfloat162float(src1))); \ -// } \ -// }; - -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); -// 
SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardsigmoidBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardtanhBackwardWithDyY); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kLeakyReluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); -// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); - -// #endif // CUDA_VERSION >= 11000 - -#define SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(op) \ - template<> \ - struct BinaryFunctor { \ - OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ - \ - BinaryFunctor float_functor; \ - OF_DEVICE_FUNC half operator()(half src0, half src1) const { \ - return __float2half(float_functor(__half2float(src0), __half2float(src1))); \ - } \ - }; - -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); -SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/core/ep/common/primitive/binary_functor.h" + +namespace oneflow { +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +template +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC Dst operator()(Src src0, Src src1) const { return pow(src0, src1); } +}; + +template<> +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(bool src0, bool src1) const { + return static_cast(pow(static_cast(src0), static_cast(src1))); + } +}; + +template<> +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC half operator()(half src0, half src1) const { + return static_cast(pow(static_cast(src0), static_cast(src1))); + } +}; + +template +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) { +#if defined(__CUDA_ARCH__) + coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); +#elif defined(__HIP_DEVICE_COMPILE__) + coef = sqrt(static_cast(2.0) / acos(static_cast(-1.0))); +#else + coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); +#endif + } + + OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { + return static_cast(0.5) + * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) + + x * coef * exp(static_cast(-0.5) * x * x)) + * dy; + } + Src coef; +}; + +template +struct BinaryFunctor { + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const { + Src tanh_val = tanh(x); + return static_cast(dy * (static_cast(1.0) - tanh_val * tanh_val)); + } +}; + +// /*********nv_bfloat16_kernel*******/ + +// #if CUDA_VERSION >= 11000 + +// template<> +// struct BinaryFunctor { +// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {} + +// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { +// return static_cast(pow(static_cast(src0), static_cast(src1))); +// } +// }; + +// #define SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(op) \ +// template<> \ +// struct BinaryFunctor { \ +// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ +// \ +// BinaryFunctor float_functor; \ +// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { \ +// return __float2bfloat16(float_functor(__bfloat162float(src0), __bfloat162float(src1))); \ +// } \ +// }; + +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardsigmoidBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardtanhBackwardWithDyY); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kLeakyReluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); +// 
SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); +// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); + +// #endif // CUDA_VERSION >= 11000 + +#define SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(op) \ + template<> \ + struct BinaryFunctor { \ + OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ + \ + BinaryFunctor float_functor; \ + OF_DEVICE_FUNC half operator()(half src0, half src1) const { \ + return __float2half(float_functor(__half2float(src0), __half2float(src1))); \ + } \ + }; + +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX); +SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp index 25759ec..38909e5 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.cpp @@ -1,110 +1,110 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -template -std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, - Scalar attr1); - -namespace { - -class BroadcastElementwiseBinaryFactoryImpl : public BroadcastElementwiseBinaryFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryFactoryImpl); - BroadcastElementwiseBinaryFactoryImpl() = default; - ~BroadcastElementwiseBinaryFactoryImpl() override = default; - - std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, - size_t max_num_dims) override { - return New(op, src_type, dst_type, max_num_dims, Scalar(), Scalar()); - } - - std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, - size_t max_num_dims, Scalar attr0) override { - return New(op, src_type, dst_type, max_num_dims, attr0, Scalar()); - } - - std::unique_ptr New(BinaryOp binary_op, DataType src_type, - DataType dst_type, size_t max_num_dims, - Scalar attr0, Scalar attr1) override { - if (max_num_dims > kMaxNumDims) { return nullptr; } -#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ - {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ - OF_PP_PAIR_SECOND(data_type_pair)), \ - NewBroadcastElementwiseBinary}, - -#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY( \ - binary_op, src_data_type_pair, dst_data_type_pair) \ - {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(src_data_type_pair), \ - OF_PP_PAIR_SECOND(dst_data_type_pair)), \ - NewBroadcastElementwiseBinary}, - -#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, data_type_pair) \ - {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ - OF_PP_PAIR_SECOND(data_type_pair)), \ - NewBroadcastElementwiseBinary}, - - static const std::map< - std::tuple, - std::function(Scalar, Scalar)>> - new_broadcast_elementwise_binary_handle{ - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, - BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) - - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY, - BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ) - - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY, - BINARY_ACTIVATION_BACKWARD_OP_SEQ, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; - -#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY -#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY - - const auto it = new_broadcast_elementwise_binary_handle.find( - std::make_tuple(binary_op, src_type, dst_type)); - if (it != new_broadcast_elementwise_binary_handle.end()) { - return it->second(attr0, attr1); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastElementwiseBinaryFactory, - BroadcastElementwiseBinaryFactoryImpl); -} // namespace -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +template +std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, + Scalar attr1); + +namespace { + +class BroadcastElementwiseBinaryFactoryImpl : public BroadcastElementwiseBinaryFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryFactoryImpl); + BroadcastElementwiseBinaryFactoryImpl() = default; + ~BroadcastElementwiseBinaryFactoryImpl() override = default; + + std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, + size_t max_num_dims) override { + return New(op, src_type, dst_type, max_num_dims, Scalar(), Scalar()); + } + + std::unique_ptr New(BinaryOp op, DataType src_type, DataType dst_type, + size_t max_num_dims, Scalar attr0) override { + return New(op, src_type, dst_type, max_num_dims, attr0, Scalar()); + } + + std::unique_ptr New(BinaryOp binary_op, DataType src_type, + DataType dst_type, size_t max_num_dims, + Scalar attr0, Scalar attr1) override { + if (max_num_dims > kMaxNumDims) { return nullptr; } +#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ + {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ + OF_PP_PAIR_SECOND(data_type_pair)), \ + NewBroadcastElementwiseBinary}, + +#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY( \ + binary_op, src_data_type_pair, dst_data_type_pair) \ + {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(src_data_type_pair), \ + OF_PP_PAIR_SECOND(dst_data_type_pair)), \ + NewBroadcastElementwiseBinary}, + +#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, data_type_pair) \ + {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair), \ + OF_PP_PAIR_SECOND(data_type_pair)), \ + NewBroadcastElementwiseBinary}, + + static const std::map< + std::tuple, + std::function(Scalar, Scalar)>> + new_broadcast_elementwise_binary_handle{ + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, + BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) + + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY, + BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ) + + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY, + BINARY_ACTIVATION_BACKWARD_OP_SEQ, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; + +#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY +#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY + + const auto it = 
new_broadcast_elementwise_binary_handle.find( + std::make_tuple(binary_op, src_type, dst_type)); + if (it != new_broadcast_elementwise_binary_handle.end()) { + return it->second(attr0, attr1); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastElementwiseBinaryFactory, + BroadcastElementwiseBinaryFactoryImpl); +} // namespace +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h index 20b2717..9a48365 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h @@ -1,397 +1,397 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -namespace { - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == sizeof(T) * N, ""); - OF_DEVICE_FUNC Pack() { - // do nothing - } - PackType storage; - T elem[N]; -}; - -template -struct BroadcastElementwiseBinaryParams { - NdIndexOffsetHelper src0_index_helper; - NdIndexOffsetHelper src1_index_helper; - NdIndexOffsetHelper dst_index_helper; - size_t num_dims; - IndexType src0_index_mask[max_dims]; - IndexType src1_index_mask[max_dims]; - IndexType count{}; - const void* src0{}; - const void* src1{}; - void* dst{}; - Scalar attr0; - Scalar attr1; -}; - -template -__global__ void BroadcastElementwiseBinaryGpu( - BroadcastElementwiseBinaryParams params) { - constexpr size_t dst_pack_size = - src0_pack_size > src1_pack_size ? 
src0_pack_size : src1_pack_size; - static_assert(src0_pack_size == dst_pack_size || src0_pack_size == 1, ""); - static_assert(src1_pack_size == dst_pack_size || src1_pack_size == 1, ""); - - const PackType* src0 = - reinterpret_cast*>(params.src0); - const PackType* src1 = - reinterpret_cast*>(params.src1); - PackType* dst = reinterpret_cast*>(params.dst); - - IndexType src0_index[max_dims]; - IndexType src1_index[max_dims]; - IndexType dst_index[max_dims]; - size_t num_dims = params.num_dims; - CUDA_1D_KERNEL_LOOP_T(IndexType, offset, params.count) { - params.dst_index_helper.OffsetToNdIndex(offset, dst_index, num_dims); -#pragma unroll - for (int i = 0; i < max_dims; ++i) { - if (i < num_dims) { - src0_index[i] = params.src0_index_mask[i] * dst_index[i]; - src1_index[i] = params.src1_index_mask[i] * dst_index[i]; - } else { - src0_index[i] = 0; - src1_index[i] = 0; - } - } - const IndexType src0_offset = params.src0_index_helper.NdIndexToOffset(src0_index, num_dims); - const IndexType src1_offset = params.src1_index_helper.NdIndexToOffset(src1_index, num_dims); - Pack src0_pack; - src0_pack.storage = src0[src0_offset]; - Pack src1_pack; - src1_pack.storage = src1[src1_offset]; - Pack dst_pack; - BinaryFunctor functor(params.attr0, params.attr1); -#pragma unroll - for (int j = 0; j < dst_pack_size; ++j) { - const Src src0_val = - (src0_pack_size == dst_pack_size) ? src0_pack.elem[j] : src0_pack.elem[0]; - const Src src1_val = - (src1_pack_size == dst_pack_size) ? src1_pack.elem[j] : src1_pack.elem[0]; - dst_pack.elem[j] = functor(src0_val, src1_val); - } - dst[offset] = dst_pack.storage; - } -} - -template -void LaunchKernel(Stream* stream, int num_dims, const int64_t* src0_dims, const void* src0, - const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, void* dst, - size_t count, Scalar attr0, Scalar attr1) { - BroadcastElementwiseBinaryParams params; - for (size_t i = 0; i < num_dims; ++i) { - params.src0_index_mask[i] = (src0_dims[i] == 1) ? 0 : 1; - params.src1_index_mask[i] = (src1_dims[i] == 1) ? 
0 : 1; - } - params.src0_index_helper = NdIndexOffsetHelper(src0_dims, num_dims); - params.src1_index_helper = NdIndexOffsetHelper(src1_dims, num_dims); - params.dst_index_helper = NdIndexOffsetHelper(dst_dims, num_dims); - params.num_dims = num_dims; - params.src0 = src0; - params.src1 = src1; - params.dst = dst; - params.count = static_cast(count); - params.attr0 = attr0; - params.attr1 = attr1; - auto* cuda_stream = stream->As(); - BroadcastElementwiseBinaryGpu - <<cuda_stream()>>>(params); -} - -template -void DispatchIndexType(Stream* stream, size_t num_dims, const int64_t* src0_dims, const void* src0, - const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, - void* dst, Scalar attr0, Scalar attr1) { - size_t count = GetElementCount(num_dims, dst_dims); - if (count < GetMaxVal()) { - LaunchKernel( - stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); - } else { - LaunchKernel( - stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); - } -} - -template -void DispatchPackSize(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, - const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, - const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, - Scalar attr1) { - void (*func)(Stream* /*stream*/, size_t /*num_dims*/, const int64_t* /*src0_dims*/, - const void* /*src0*/, const int64_t* /*src1_dims*/, const void* /*src1*/, - const int64_t* /*dst_dims*/, void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = - nullptr; - if (src0_pack_size == 1 && src1_pack_size == 1) { - func = DispatchIndexType; - } else if (src0_pack_size == 4 && src1_pack_size == 4) { - func = DispatchIndexType; - } else if (src0_pack_size == 1 && src1_pack_size == 4) { - func = DispatchIndexType; - } else if (src0_pack_size == 4 && src1_pack_size == 1) { - func = DispatchIndexType; - } else { - UNIMPLEMENTED(); - } - func(stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, attr0, attr1); -} - -template -void DispatchNumDims(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, - const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, - const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, - Scalar attr1) { - void (*func)(Stream* /*stream*/, size_t /*src0_pack_size*/, size_t /*src1_pack_size*/, - size_t /*num_dims*/, const int64_t* /*src0_dims*/, const void* /*src0*/, - const int64_t* /*src1_dims*/, const void* /*src1*/, const int64_t* /*dst_dims*/, - void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = nullptr; - CHECK_NE(num_dims, 1); - if (num_dims == 2) { - func = DispatchPackSize; - } else if (num_dims == 3) { - func = DispatchPackSize; - } else if (num_dims == 4) { - func = DispatchPackSize; - } else if (num_dims <= 8) { - func = DispatchPackSize; - } else { - UNIMPLEMENTED(); - } - func(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, - dst, attr0, attr1); -} - -template -size_t GetPackSize(size_t num_src_dims, const int64_t* src0_dims, const void* src0, - const int64_t* src1_dims, const void* src1, void* dst) { - static_assert(max_pack_size > 0 && (max_pack_size & (max_pack_size - 1)) == 0, ""); - CHECK(src0_dims[num_src_dims - 1] != 1 || src1_dims[num_src_dims - 1] != 1); - auto dst_ptr = reinterpret_cast(dst); - for (size_t pack_size = max_pack_size; pack_size > 2; pack_size /= 2) { - bool is_src0_supported = (src0_dims[num_src_dims - 1] == 1) - || 
IsPackSizeSupported(pack_size, num_src_dims, src0_dims, src0); - bool is_src1_supported = (src1_dims[num_src_dims - 1] == 1) - || IsPackSizeSupported(pack_size, num_src_dims, src1_dims, src1); - if (is_src0_supported && is_src1_supported && (dst_ptr % (pack_size * sizeof(R))) == 0) { - return pack_size; - } - } - return 1; -} - -constexpr size_t kMaxPackSize = 4; - -template -void LaunchWithSimplified(Stream* stream, size_t simplified_num_dims, int64_t* simplified_src0_dims, - const void* src0, int64_t* simplified_src1_dims, const void* src1, - int64_t* simplified_dst_dims, void* dst, Scalar attr0, Scalar attr1) { - CHECK_LE(simplified_num_dims, kMaxNumDims); - size_t pack_size = GetPackSize(simplified_num_dims, simplified_src0_dims, - src0, simplified_src1_dims, src1, dst); - size_t src0_pack_size = 1; - size_t src1_pack_size = 1; - if (simplified_src0_dims[simplified_num_dims - 1] != 1) { - simplified_src0_dims[simplified_num_dims - 1] /= pack_size; - src0_pack_size = pack_size; - } - if (simplified_src1_dims[simplified_num_dims - 1] != 1) { - simplified_src1_dims[simplified_num_dims - 1] /= pack_size; - src1_pack_size = pack_size; - } - simplified_dst_dims[simplified_num_dims - 1] /= pack_size; - DispatchNumDims(stream, src0_pack_size, src1_pack_size, simplified_num_dims, - simplified_src0_dims, src0, simplified_src1_dims, src1, - simplified_dst_dims, dst, attr0, attr1); -} - -template -struct BinaryLhsScalarFunctor { - __host__ __device__ BinaryLhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) - : scalar(scalar), functor(attr0, attr1) {} - __device__ Dst operator()(Src src) const { return functor(scalar, src); } - const Src scalar; - BinaryFunctor functor; -}; - -template -struct BinaryRhsScalarFunctor { - __host__ __device__ BinaryRhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) - : scalar(scalar), functor(attr0, attr1) {} - __device__ Dst operator()(Src src) const { return functor(src, scalar); } - const Src scalar; - BinaryFunctor functor; -}; - -template -struct BinaryLhsScalarPtrFunctorFactory { - __host__ __device__ BinaryLhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, - Scalar attr1) - : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} - __device__ BinaryLhsScalarFunctor operator()() const { - return BinaryLhsScalarFunctor(*scalar_ptr, attr0, attr1); - } - const Src* scalar_ptr; - Scalar attr0, attr1; -}; - -template -struct BinaryRhsScalarPtrFunctorFactory { - __host__ __device__ explicit BinaryRhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, - Scalar attr1) - : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} - __device__ BinaryRhsScalarFunctor operator()() const { - return BinaryRhsScalarFunctor(*scalar_ptr, attr0, attr1); - } - const Src* scalar_ptr; - Scalar attr0, attr1; -}; - -template -void DispatchLaunch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const Src* src0, - size_t num_src1_dims, const int64_t* src1_dims, const Src* src1, Dst* dst, - Scalar attr0, Scalar attr1) { - auto* cuda_stream = stream->As(); - size_t simplified_num_dims = 0; - int64_t simplified_src0_dims[kMaxNumDims]; - int64_t simplified_src1_dims[kMaxNumDims]; - int64_t simplified_dst_dims[kMaxNumDims]; - SimplifyBroadcastDims(num_src0_dims, src0_dims, num_src1_dims, src1_dims, - &simplified_num_dims, simplified_src0_dims, - simplified_src1_dims, simplified_dst_dims); - CheckInplace(simplified_num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1, - simplified_dst_dims, dst); - if (IsDimsEquals(simplified_num_dims, 
simplified_src0_dims, simplified_num_dims, - simplified_src1_dims)) { - const int64_t elem_cnt = GetElementCount(simplified_num_dims, simplified_src0_dims); - OF_CUDA_CHECK((cuda::elementwise::Binary( - BinaryFunctor(attr0, attr1), elem_cnt, dst, src0, - src1, cuda_stream->cuda_stream()))); - } else { - if (simplified_num_dims == 1 && simplified_src0_dims[0] == 1) { - OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( - BinaryLhsScalarPtrFunctorFactory(src0, attr0, attr1), - simplified_src1_dims[0], dst, src1, cuda_stream->cuda_stream()))); - } else if (simplified_num_dims == 1 && simplified_src1_dims[0] == 1) { - OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( - BinaryRhsScalarPtrFunctorFactory(src1, attr0, attr1), - simplified_src0_dims[0], dst, src0, cuda_stream->cuda_stream()))); - } else { - LaunchWithSimplified(stream, simplified_num_dims, simplified_src0_dims, - src0, simplified_src1_dims, src1, - simplified_dst_dims, dst, attr0, attr1); - } - } -} - -template -T GetValue(Scalar value) { - return value.Value(); -} - -template<> -half GetValue(Scalar value) { - return static_cast(GetValue(value)); -} - -// #if CUDA_VERSION >= 11000 - -// template<> -// nv_bfloat16 GetValue(Scalar value) { -// return static_cast(GetValue(value)); -// } - -// #endif // CUDA_VERSION >= 11000 - -template -class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { - public: - OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryImpl); - BroadcastElementwiseBinaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} - ~BroadcastElementwiseBinaryImpl() override = default; - - void Launch(Stream* stream, Scalar src0, size_t num_src1_dims, const int64_t* src1_dims, - const void* src1, void* dst) override { - auto* cuda_stream = stream->As(); - const size_t elem_cnt = GetElementCount(num_src1_dims, src1_dims); - OF_CUDA_CHECK((cuda::elementwise::Unary( - BinaryLhsScalarFunctor(GetValue(src0), attr0, attr1), elem_cnt, - reinterpret_cast(dst), reinterpret_cast(src1), - cuda_stream->cuda_stream()))); - } - void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, - Scalar src1, void* dst) override { - auto* cuda_stream = stream->As(); - const size_t elem_cnt = GetElementCount(num_src0_dims, src0_dims); - OF_CUDA_CHECK((cuda::elementwise::Unary( - BinaryRhsScalarFunctor(GetValue(src1), attr0, attr1), elem_cnt, - reinterpret_cast(dst), reinterpret_cast(src0), - cuda_stream->cuda_stream()))); - } - void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, - size_t num_src1_dims, const int64_t* src1_dims, const void* src1, - void* dst) override { - DispatchLaunch( - stream, num_src0_dims, src0_dims, reinterpret_cast(src0), num_src1_dims, - src1_dims, reinterpret_cast(src1), reinterpret_cast(dst), attr0, attr1); - } - - private: - Scalar attr0, attr1; -}; - -} // namespace - -template -std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, - Scalar attr1) { - return std::unique_ptr( - new BroadcastElementwiseBinaryImpl(attr0, attr1)); -} - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +namespace { + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * N, ""); + OF_DEVICE_FUNC Pack() { + // do nothing + } + PackType storage; + T elem[N]; +}; + +template +struct BroadcastElementwiseBinaryParams { + NdIndexOffsetHelper src0_index_helper; + NdIndexOffsetHelper src1_index_helper; + NdIndexOffsetHelper dst_index_helper; + size_t num_dims; + IndexType src0_index_mask[max_dims]; + IndexType src1_index_mask[max_dims]; + IndexType count{}; + const void* src0{}; + const void* src1{}; + void* dst{}; + Scalar attr0; + Scalar attr1; +}; + +template +__global__ void BroadcastElementwiseBinaryGpu( + BroadcastElementwiseBinaryParams params) { + constexpr size_t dst_pack_size = + src0_pack_size > src1_pack_size ? src0_pack_size : src1_pack_size; + static_assert(src0_pack_size == dst_pack_size || src0_pack_size == 1, ""); + static_assert(src1_pack_size == dst_pack_size || src1_pack_size == 1, ""); + + const PackType* src0 = + reinterpret_cast*>(params.src0); + const PackType* src1 = + reinterpret_cast*>(params.src1); + PackType* dst = reinterpret_cast*>(params.dst); + + IndexType src0_index[max_dims]; + IndexType src1_index[max_dims]; + IndexType dst_index[max_dims]; + size_t num_dims = params.num_dims; + CUDA_1D_KERNEL_LOOP_T(IndexType, offset, params.count) { + params.dst_index_helper.OffsetToNdIndex(offset, dst_index, num_dims); +#pragma unroll + for (int i = 0; i < max_dims; ++i) { + if (i < num_dims) { + src0_index[i] = params.src0_index_mask[i] * dst_index[i]; + src1_index[i] = params.src1_index_mask[i] * dst_index[i]; + } else { + src0_index[i] = 0; + src1_index[i] = 0; + } + } + const IndexType src0_offset = params.src0_index_helper.NdIndexToOffset(src0_index, num_dims); + const IndexType src1_offset = params.src1_index_helper.NdIndexToOffset(src1_index, num_dims); + Pack src0_pack; + src0_pack.storage = src0[src0_offset]; + Pack src1_pack; + src1_pack.storage = src1[src1_offset]; + Pack dst_pack; + BinaryFunctor functor(params.attr0, params.attr1); +#pragma unroll + for (int j = 0; j < dst_pack_size; ++j) { + const Src src0_val = + (src0_pack_size == dst_pack_size) ? src0_pack.elem[j] : src0_pack.elem[0]; + const Src src1_val = + (src1_pack_size == dst_pack_size) ? 
src1_pack.elem[j] : src1_pack.elem[0]; + dst_pack.elem[j] = functor(src0_val, src1_val); + } + dst[offset] = dst_pack.storage; + } +} + +template +void LaunchKernel(Stream* stream, int num_dims, const int64_t* src0_dims, const void* src0, + const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, void* dst, + size_t count, Scalar attr0, Scalar attr1) { + BroadcastElementwiseBinaryParams params; + for (size_t i = 0; i < num_dims; ++i) { + params.src0_index_mask[i] = (src0_dims[i] == 1) ? 0 : 1; + params.src1_index_mask[i] = (src1_dims[i] == 1) ? 0 : 1; + } + params.src0_index_helper = NdIndexOffsetHelper(src0_dims, num_dims); + params.src1_index_helper = NdIndexOffsetHelper(src1_dims, num_dims); + params.dst_index_helper = NdIndexOffsetHelper(dst_dims, num_dims); + params.num_dims = num_dims; + params.src0 = src0; + params.src1 = src1; + params.dst = dst; + params.count = static_cast(count); + params.attr0 = attr0; + params.attr1 = attr1; + auto* cuda_stream = stream->As(); + BroadcastElementwiseBinaryGpu + <<cuda_stream()>>>(params); +} + +template +void DispatchIndexType(Stream* stream, size_t num_dims, const int64_t* src0_dims, const void* src0, + const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, + void* dst, Scalar attr0, Scalar attr1) { + size_t count = GetElementCount(num_dims, dst_dims); + if (count < GetMaxVal()) { + LaunchKernel( + stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); + } else { + LaunchKernel( + stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1); + } +} + +template +void DispatchPackSize(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, + const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, + const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, + Scalar attr1) { + void (*func)(Stream* /*stream*/, size_t /*num_dims*/, const int64_t* /*src0_dims*/, + const void* /*src0*/, const int64_t* /*src1_dims*/, const void* /*src1*/, + const int64_t* /*dst_dims*/, void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = + nullptr; + if (src0_pack_size == 1 && src1_pack_size == 1) { + func = DispatchIndexType; + } else if (src0_pack_size == 4 && src1_pack_size == 4) { + func = DispatchIndexType; + } else if (src0_pack_size == 1 && src1_pack_size == 4) { + func = DispatchIndexType; + } else if (src0_pack_size == 4 && src1_pack_size == 1) { + func = DispatchIndexType; + } else { + UNIMPLEMENTED(); + } + func(stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, attr0, attr1); +} + +template +void DispatchNumDims(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims, + const int64_t* src0_dims, const void* src0, const int64_t* src1_dims, + const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0, + Scalar attr1) { + void (*func)(Stream* /*stream*/, size_t /*src0_pack_size*/, size_t /*src1_pack_size*/, + size_t /*num_dims*/, const int64_t* /*src0_dims*/, const void* /*src0*/, + const int64_t* /*src1_dims*/, const void* /*src1*/, const int64_t* /*dst_dims*/, + void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = nullptr; + CHECK_NE(num_dims, 1); + if (num_dims == 2) { + func = DispatchPackSize; + } else if (num_dims == 3) { + func = DispatchPackSize; + } else if (num_dims == 4) { + func = DispatchPackSize; + } else if (num_dims <= 8) { + func = DispatchPackSize; + } else { + UNIMPLEMENTED(); + } + func(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0, src1_dims, 
src1, dst_dims, + dst, attr0, attr1); +} + +template +size_t GetPackSize(size_t num_src_dims, const int64_t* src0_dims, const void* src0, + const int64_t* src1_dims, const void* src1, void* dst) { + static_assert(max_pack_size > 0 && (max_pack_size & (max_pack_size - 1)) == 0, ""); + CHECK(src0_dims[num_src_dims - 1] != 1 || src1_dims[num_src_dims - 1] != 1); + auto dst_ptr = reinterpret_cast(dst); + for (size_t pack_size = max_pack_size; pack_size > 2; pack_size /= 2) { + bool is_src0_supported = (src0_dims[num_src_dims - 1] == 1) + || IsPackSizeSupported(pack_size, num_src_dims, src0_dims, src0); + bool is_src1_supported = (src1_dims[num_src_dims - 1] == 1) + || IsPackSizeSupported(pack_size, num_src_dims, src1_dims, src1); + if (is_src0_supported && is_src1_supported && (dst_ptr % (pack_size * sizeof(R))) == 0) { + return pack_size; + } + } + return 1; +} + +constexpr size_t kMaxPackSize = 4; + +template +void LaunchWithSimplified(Stream* stream, size_t simplified_num_dims, int64_t* simplified_src0_dims, + const void* src0, int64_t* simplified_src1_dims, const void* src1, + int64_t* simplified_dst_dims, void* dst, Scalar attr0, Scalar attr1) { + CHECK_LE(simplified_num_dims, kMaxNumDims); + size_t pack_size = GetPackSize(simplified_num_dims, simplified_src0_dims, + src0, simplified_src1_dims, src1, dst); + size_t src0_pack_size = 1; + size_t src1_pack_size = 1; + if (simplified_src0_dims[simplified_num_dims - 1] != 1) { + simplified_src0_dims[simplified_num_dims - 1] /= pack_size; + src0_pack_size = pack_size; + } + if (simplified_src1_dims[simplified_num_dims - 1] != 1) { + simplified_src1_dims[simplified_num_dims - 1] /= pack_size; + src1_pack_size = pack_size; + } + simplified_dst_dims[simplified_num_dims - 1] /= pack_size; + DispatchNumDims(stream, src0_pack_size, src1_pack_size, simplified_num_dims, + simplified_src0_dims, src0, simplified_src1_dims, src1, + simplified_dst_dims, dst, attr0, attr1); +} + +template +struct BinaryLhsScalarFunctor { + __host__ __device__ BinaryLhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) + : scalar(scalar), functor(attr0, attr1) {} + __device__ Dst operator()(Src src) const { return functor(scalar, src); } + const Src scalar; + BinaryFunctor functor; +}; + +template +struct BinaryRhsScalarFunctor { + __host__ __device__ BinaryRhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1) + : scalar(scalar), functor(attr0, attr1) {} + __device__ Dst operator()(Src src) const { return functor(src, scalar); } + const Src scalar; + BinaryFunctor functor; +}; + +template +struct BinaryLhsScalarPtrFunctorFactory { + __host__ __device__ BinaryLhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, + Scalar attr1) + : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} + __device__ BinaryLhsScalarFunctor operator()() const { + return BinaryLhsScalarFunctor(*scalar_ptr, attr0, attr1); + } + const Src* scalar_ptr; + Scalar attr0, attr1; +}; + +template +struct BinaryRhsScalarPtrFunctorFactory { + __host__ __device__ explicit BinaryRhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0, + Scalar attr1) + : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {} + __device__ BinaryRhsScalarFunctor operator()() const { + return BinaryRhsScalarFunctor(*scalar_ptr, attr0, attr1); + } + const Src* scalar_ptr; + Scalar attr0, attr1; +}; + +template +void DispatchLaunch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const Src* src0, + size_t num_src1_dims, const int64_t* src1_dims, const Src* src1, Dst* dst, + Scalar attr0, Scalar 
attr1) { + auto* cuda_stream = stream->As(); + size_t simplified_num_dims = 0; + int64_t simplified_src0_dims[kMaxNumDims]; + int64_t simplified_src1_dims[kMaxNumDims]; + int64_t simplified_dst_dims[kMaxNumDims]; + SimplifyBroadcastDims(num_src0_dims, src0_dims, num_src1_dims, src1_dims, + &simplified_num_dims, simplified_src0_dims, + simplified_src1_dims, simplified_dst_dims); + CheckInplace(simplified_num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1, + simplified_dst_dims, dst); + if (IsDimsEquals(simplified_num_dims, simplified_src0_dims, simplified_num_dims, + simplified_src1_dims)) { + const int64_t elem_cnt = GetElementCount(simplified_num_dims, simplified_src0_dims); + OF_CUDA_CHECK((cuda::elementwise::Binary( + BinaryFunctor(attr0, attr1), elem_cnt, dst, src0, + src1, cuda_stream->cuda_stream()))); + } else { + if (simplified_num_dims == 1 && simplified_src0_dims[0] == 1) { + OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( + BinaryLhsScalarPtrFunctorFactory(src0, attr0, attr1), + simplified_src1_dims[0], dst, src1, cuda_stream->cuda_stream()))); + } else if (simplified_num_dims == 1 && simplified_src1_dims[0] == 1) { + OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( + BinaryRhsScalarPtrFunctorFactory(src1, attr0, attr1), + simplified_src0_dims[0], dst, src0, cuda_stream->cuda_stream()))); + } else { + LaunchWithSimplified(stream, simplified_num_dims, simplified_src0_dims, + src0, simplified_src1_dims, src1, + simplified_dst_dims, dst, attr0, attr1); + } + } +} + +template +T GetValue(Scalar value) { + return value.Value(); +} + +template<> +half GetValue(Scalar value) { + return static_cast(GetValue(value)); +} + +// #if CUDA_VERSION >= 11000 + +// template<> +// nv_bfloat16 GetValue(Scalar value) { +// return static_cast(GetValue(value)); +// } + +// #endif // CUDA_VERSION >= 11000 + +template +class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary { + public: + OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryImpl); + BroadcastElementwiseBinaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} + ~BroadcastElementwiseBinaryImpl() override = default; + + void Launch(Stream* stream, Scalar src0, size_t num_src1_dims, const int64_t* src1_dims, + const void* src1, void* dst) override { + auto* cuda_stream = stream->As(); + const size_t elem_cnt = GetElementCount(num_src1_dims, src1_dims); + OF_CUDA_CHECK((cuda::elementwise::Unary( + BinaryLhsScalarFunctor(GetValue(src0), attr0, attr1), elem_cnt, + reinterpret_cast(dst), reinterpret_cast(src1), + cuda_stream->cuda_stream()))); + } + void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, + Scalar src1, void* dst) override { + auto* cuda_stream = stream->As(); + const size_t elem_cnt = GetElementCount(num_src0_dims, src0_dims); + OF_CUDA_CHECK((cuda::elementwise::Unary( + BinaryRhsScalarFunctor(GetValue(src1), attr0, attr1), elem_cnt, + reinterpret_cast(dst), reinterpret_cast(src0), + cuda_stream->cuda_stream()))); + } + void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0, + size_t num_src1_dims, const int64_t* src1_dims, const void* src1, + void* dst) override { + DispatchLaunch( + stream, num_src0_dims, src0_dims, reinterpret_cast(src0), num_src1_dims, + src1_dims, reinterpret_cast(src1), reinterpret_cast(dst), attr0, attr1); + } + + private: + Scalar attr0, attr1; +}; + +} // namespace + +template +std::unique_ptr NewBroadcastElementwiseBinary(Scalar attr0, + Scalar attr1) { + return 
std::unique_ptr<BroadcastElementwiseBinary>(
+      new BroadcastElementwiseBinaryImpl<binary_op, Src, Dst>(attr0, attr1));
+}
+
+}  // namespace broadcast_elementwise_binary
+}  // namespace primitive
+}  // namespace ep
+
 }  // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp
index c6252be..c991252 100644
--- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp
+++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_activation_grad.hip.cpp
@@ -1,39 +1,39 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
-
-namespace oneflow {
-
-namespace ep {
-namespace primitive {
-namespace broadcast_elementwise_binary {
-
-#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op,      \
-                                                                           data_type_pair) \
-  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<      \
-      binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>(      \
-      Scalar attr0, Scalar attr1);
-
-OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY,
-                                 BINARY_ACTIVATION_BACKWARD_OP_SEQ,
-                                 CUDA_PRIMITIVE_FLOATING_TYPE_SEQ);
-
-}  // namespace broadcast_elementwise_binary
-}  // namespace primitive
-}  // namespace ep
-
-}  // namespace oneflow
-
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, \ + data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY, + BINARY_ACTIVATION_BACKWARD_OP_SEQ, + CUDA_PRIMITIVE_FLOATING_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + +} // namespace oneflow + diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp index a7fc91e..fd9c0d4 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_comparision.hip.cpp @@ -1,38 +1,38 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY( \ - binary_op, src_data_type_pair, dst_data_type_pair) \ - template std::unique_ptr NewBroadcastElementwiseBinary< \ - binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ - Scalar attr0, Scalar attr1); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY, - BINARY_COMPARISION_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY( \ + binary_op, src_data_type_pair, dst_data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY, + BINARY_COMPARISION_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp index ffa05eb..4a03ee1 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_logical.hip.cpp @@ -1,38 +1,38 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY(binary_op, src_data_type_pair, \ - dst_data_type_pair) \ - template std::unique_ptr NewBroadcastElementwiseBinary< \ - binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ - Scalar attr0, Scalar attr1); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY, - BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, - CUDA_PRIMITIVE_ALL_TYPE_SEQ, CUDA_PRIMITIVE_BOOL_TYPE_SEQ); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY(binary_op, src_data_type_pair, \ + dst_data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY, + BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, + CUDA_PRIMITIVE_ALL_TYPE_SEQ, CUDA_PRIMITIVE_BOOL_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp index a2ca2bb..144f1a7 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary_math.hip.cpp @@ -1,36 +1,36 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { -namespace broadcast_elementwise_binary { - -#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ - template std::unique_ptr NewBroadcastElementwiseBinary< \ - binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>( \ - Scalar attr0, Scalar attr1); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, - BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ); - -} // namespace broadcast_elementwise_binary -} // namespace primitive -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
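The INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_*_ENTRY macros in these files are X-macro style explicit instantiations: OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE expands one explicit-instantiation line per (op, dtype) combination, so the factory defined in broadcast_elementwise_binary.hip.h gets compiled for every supported pairing. A toy version of the same pattern with hand-written entries (names below are made up for illustration only):

#include <cstdio>

template<int op, typename T>
T Apply(T a, T b) {
  return op == 0 ? a + b : a * b;
}

// One explicit instantiation per (op, type) pair, the role each
// INSTANTIATE_..._ENTRY expansion plays in the files above.
#define INSTANTIATE_APPLY(op, T) template T Apply<op, T>(T, T);
INSTANTIATE_APPLY(0, float)
INSTANTIATE_APPLY(0, double)
INSTANTIATE_APPLY(1, float)
INSTANTIATE_APPLY(1, double)
#undef INSTANTIATE_APPLY

int main() {
  std::printf("%g\n", Apply<1, double>(1.5, 2.0));
  return 0;
}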
+*/ +#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { +namespace broadcast_elementwise_binary { + +#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \ + template std::unique_ptr NewBroadcastElementwiseBinary< \ + binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>( \ + Scalar attr0, Scalar attr1); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY, + BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ); + +} // namespace broadcast_elementwise_binary +} // namespace primitive +} // namespace ep + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp b/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp index d42ace2..3a92b64 100644 --- a/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp +++ b/oneflow/core/ep/rocm/primitive/broadcast_matmul.cpp @@ -1,237 +1,237 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM - -#include "oneflow/core/ep/include/primitive/primitive.h" -#include "oneflow/core/ep/include/primitive/broadcast_matmul.h" -#include "oneflow/core/ep/common/primitive/broadcast_matmul.h" -#include "oneflow/core/common/optional.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include -#include -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace broadcast_matmul { - -namespace internal { - -namespace { - -constexpr size_t kMaxNumDims = 8; - -Optional OptCudaDataType(DataType data_type) { - switch (data_type) { - case kFloat: return HIPBLAS_R_32F; - case kDouble: return HIPBLAS_R_64F; - case kFloat16: return HIPBLAS_R_16F; -// #if CUDA_VERSION >= 11000 -// case kBFloat16: return CUDA_R_16BF; -// #endif // CUDA_VERSION >= 11000 - default: return NullOpt; - } -} - -hipblasDatatype_t GetCudaDataType(DataType data_type) { - auto cuda_data_type = OptCudaDataType(data_type); - CHECK(cuda_data_type.has_value()); - return cuda_data_type.value_or(HIPBLAS_R_32F); -} - -union CublasScalarParameter { - double d; - float s; -}; - -CublasScalarParameter GetCublasScalarParameter(Scalar scalar, hipblasDatatype_t compute_type) { - CublasScalarParameter sp{}; - if (compute_type == HIPBLAS_R_64F) { - sp.d = scalar.Value(); - } else if (compute_type == HIPBLAS_R_32F) { - sp.s = scalar.Value(); - } else if (compute_type == HIPBLAS_R_16F) { - sp.s = scalar.Value(); - } else { - UNIMPLEMENTED(); - } - return sp; -} - -hipblasDatatype_t GetComputeType(DataType data_type) { - switch (data_type) { - case kFloat: return HIPBLAS_R_32F; - case kDouble: return HIPBLAS_R_64F; - case kFloat16: return HIPBLAS_R_16F; -// #if CUDA_VERSION >= 11000 -// case kBFloat16: return HIPBLAS_R_32F; -// #endif // CUDA_VERSION >= 11000 - default: UNIMPLEMENTED(); return HIPBLAS_R_32F; - } -} - -void LaunchBroadcastMatmul(Stream* stream, DataType 
data_type, BlasTransposeType transpose_a, - BlasTransposeType transpose_b, int64_t num_batch_dims, - const int64_t* broadcast_batch_dims, const int64_t* a_batch_dims, - const int64_t* b_batch_dims, const int64_t* c_batch_dims, int64_t m, - int64_t n, int64_t k, Scalar alpha, const void* a, const void* b, - Scalar beta, void* c) { - auto* cuda_stream = stream->As(); - const auto cuda_data_type = GetCudaDataType(data_type); - const auto compute_type = GetComputeType(data_type); - const auto sp_alpha = GetCublasScalarParameter(alpha, compute_type); - __half h_alpha = 0; - if (compute_type == HIPBLAS_R_16F) { - h_alpha = __float2half(sp_alpha.s); - } - const auto GetCublasOperation = [](BlasTransposeType transpose_type) { - if (transpose_type == BlasTransposeType::N) { - return HIPBLAS_OP_N; - } else if (transpose_type == BlasTransposeType::T) { - return HIPBLAS_OP_T; - } else { - UNIMPLEMENTED(); - return HIPBLAS_OP_N; - } - }; - const hipblasOperation_t cublas_trans_a = GetCublasOperation(transpose_b); - const hipblasOperation_t cublas_trans_b = GetCublasOperation(transpose_a); - const int cublas_m = n; - const int cublas_n = m; - const int cublas_k = k; - int cublas_lda = 0; - if (transpose_b == BlasTransposeType::N) { - cublas_lda = n; - } else if (transpose_b == BlasTransposeType::T) { - cublas_lda = k; - } else { - UNIMPLEMENTED(); - } - int cublas_ldb = 0; - if (transpose_a == BlasTransposeType::N) { - cublas_ldb = k; - } else if (transpose_a == BlasTransposeType::T) { - cublas_ldb = m; - } else { - UNIMPLEMENTED(); - } - const int cublas_ldc = n; - // CublasMathModeGuard guard(cuda_stream->cublas_handle()); -// if (data_type == DataType::kFloat16) { -// #if CUDA_VERSION < 11000 -// guard.SetMathMode(CUBLAS_TENSOR_OP_MATH); -// #else -// guard.SetMathMode(CUBLAS_DEFAULT_MATH); -// #endif // CUDA_VERSION < 11000 -// } -// #if CUDA_VERSION >= 11000 -// hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; - hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; -// #else -// hipblasGemmAlgo_t algo = -// (data_type == DataType::kFloat16) ? CUBLAS_GEMM_DFALT_TENSOR_OP : HIPBLAS_GEMM_DEFAULT; -// #endif - if (num_batch_dims == 1 && c_batch_dims[0] != 1) { - const void* cublas_a = b; - const void* cublas_b = a; - void* cublas_c = c; - const int64_t a_batch_count = a_batch_dims[0]; - const int64_t b_batch_count = b_batch_dims[0]; - CHECK(a_batch_count == 1 || b_batch_count == 1 || a_batch_count == b_batch_count); - CHECK_GT(a_batch_count, 0); - CHECK_GT(b_batch_count, 0); - const int batch_count = std::max(a_batch_count, b_batch_count); - const long long int cublas_stride_a = b_batch_count == 1 ? 0 : cublas_m * cublas_k; - const long long int cublas_stride_b = a_batch_count == 1 ? 
0 : cublas_k * cublas_n; - const long long int cublas_stride_c = cublas_m * cublas_n; - const auto sp_beta = GetCublasScalarParameter(beta, compute_type); - __half h_beta = 0; - if (compute_type == HIPBLAS_R_16F) { - h_beta = __float2half(sp_beta.s); - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, - &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, - cublas_ldb, cublas_stride_b, &h_beta, cublas_c, cuda_data_type, cublas_ldc, - cublas_stride_c, batch_count, compute_type, algo)); - } else { - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, - &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, - cublas_ldb, cublas_stride_b, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, - cublas_stride_c, batch_count, compute_type, algo)); - } - - } else { - auto func = [&](const void* batch_a, const void* batch_b, void* batch_c, Scalar batch_beta) { - const auto sp_beta = GetCublasScalarParameter(batch_beta, compute_type); - __half h_beta = 0; - const void* cublas_a = batch_b; - const void* cublas_b = batch_a; - void* cublas_c = batch_c; - if (compute_type == HIPBLAS_R_16F) { - h_beta = __float2half(sp_beta.s); - OF_CUBLAS_CHECK(hipblasGemmEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, - cublas_k, &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, - cublas_ldb, &h_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); - } else { - OF_CUBLAS_CHECK(hipblasGemmEx( - cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, - cublas_k, &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, - cublas_ldb, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); - } - - }; - ForEachMatmul(data_type, m, n, k, beta, num_batch_dims, broadcast_batch_dims, - a_batch_dims, b_batch_dims, c_batch_dims, a, b, c, func); - } -} - -class BroadcastMatmulFactoryImpl : public BroadcastMatmulFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(BroadcastMatmulFactoryImpl); - BroadcastMatmulFactoryImpl() = default; - ~BroadcastMatmulFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type, BlasTransposeType transpose_a, - BlasTransposeType transpose_b, - size_t max_num_dims) override { - auto cuda_data_type = OptCudaDataType(data_type); - if (max_num_dims <= kMaxNumDims && cuda_data_type.has_value()) { - return std::make_unique>(data_type, transpose_a, - transpose_b); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastMatmulFactory, BroadcastMatmulFactoryImpl); - -} // namespace - -} // namespace internal - -} // namespace broadcast_matmul - -} // namespace primitive -} // namespace ep - -} // namespace oneflow - -#endif // WITH_ROCM +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
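LaunchBroadcastMatmul above hands hipBLAS the operands swapped (cublas_a = b, cublas_b = a, transposes exchanged) and m/n exchanged. That is the standard way to obtain a row-major C = A * B from a column-major GEMM: asking the column-major routine for the n-by-m product of the reinterpreted buffers writes exactly the row-major m-by-n result. A self-contained check of that identity, using a naive column-major GEMM as a stand-in for the hipblasGemmEx call:

#include <cassert>
#include <cstdio>
#include <vector>

// Naive column-major GEMM: C(m x n) = A(m x k) * B(k x n), leading dims in rows.
void GemmColMajor(int m, int n, int k, const float* A, int lda, const float* B, int ldb,
                  float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  const int m = 2, n = 3, k = 4;
  std::vector<float> a(m * k), b(k * n), c(m * n), ref(m * n);
  for (int i = 0; i < m * k; ++i) a[i] = 0.5f * i;          // row-major A
  for (int i = 0; i < k * n; ++i) b[i] = 1.0f - 0.25f * i;  // row-major B
  // Row-major reference: C[i][j] = sum_p A[i][p] * B[p][j].
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += a[i * k + p] * b[p * n + j];
      ref[i * n + j] = acc;
    }
  // The "swapped" call used above: column-major GEMM with (B, A), dims (n, m, k),
  // lda = n, ldb = k, ldc = n. The output buffer ends up holding row-major C.
  GemmColMajor(n, m, k, b.data(), n, a.data(), k, c.data(), n);
  for (int i = 0; i < m * n; ++i) assert(c[i] == ref[i]);
  std::printf("row-major C via column-major swapped GEMM: OK\n");
  return 0;
}

The strided-batched branch in the hunk applies the same swap per batch, setting the stride of whichever operand has batch count 1 to zero so it is broadcast across batches.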
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_ROCM + +#include "oneflow/core/ep/include/primitive/primitive.h" +#include "oneflow/core/ep/include/primitive/broadcast_matmul.h" +#include "oneflow/core/ep/common/primitive/broadcast_matmul.h" +#include "oneflow/core/common/optional.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include +#include +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace broadcast_matmul { + +namespace internal { + +namespace { + +constexpr size_t kMaxNumDims = 8; + +Optional OptCudaDataType(DataType data_type) { + switch (data_type) { + case kFloat: return HIPBLAS_R_32F; + case kDouble: return HIPBLAS_R_64F; + case kFloat16: return HIPBLAS_R_16F; +// #if CUDA_VERSION >= 11000 +// case kBFloat16: return CUDA_R_16BF; +// #endif // CUDA_VERSION >= 11000 + default: return NullOpt; + } +} + +hipblasDatatype_t GetCudaDataType(DataType data_type) { + auto cuda_data_type = OptCudaDataType(data_type); + CHECK(cuda_data_type.has_value()); + return cuda_data_type.value_or(HIPBLAS_R_32F); +} + +union CublasScalarParameter { + double d; + float s; +}; + +CublasScalarParameter GetCublasScalarParameter(Scalar scalar, hipblasDatatype_t compute_type) { + CublasScalarParameter sp{}; + if (compute_type == HIPBLAS_R_64F) { + sp.d = scalar.Value(); + } else if (compute_type == HIPBLAS_R_32F) { + sp.s = scalar.Value(); + } else if (compute_type == HIPBLAS_R_16F) { + sp.s = scalar.Value(); + } else { + UNIMPLEMENTED(); + } + return sp; +} + +hipblasDatatype_t GetComputeType(DataType data_type) { + switch (data_type) { + case kFloat: return HIPBLAS_R_32F; + case kDouble: return HIPBLAS_R_64F; + case kFloat16: return HIPBLAS_R_16F; +// #if CUDA_VERSION >= 11000 +// case kBFloat16: return HIPBLAS_R_32F; +// #endif // CUDA_VERSION >= 11000 + default: UNIMPLEMENTED(); return HIPBLAS_R_32F; + } +} + +void LaunchBroadcastMatmul(Stream* stream, DataType data_type, BlasTransposeType transpose_a, + BlasTransposeType transpose_b, int64_t num_batch_dims, + const int64_t* broadcast_batch_dims, const int64_t* a_batch_dims, + const int64_t* b_batch_dims, const int64_t* c_batch_dims, int64_t m, + int64_t n, int64_t k, Scalar alpha, const void* a, const void* b, + Scalar beta, void* c) { + auto* cuda_stream = stream->As(); + const auto cuda_data_type = GetCudaDataType(data_type); + const auto compute_type = GetComputeType(data_type); + const auto sp_alpha = GetCublasScalarParameter(alpha, compute_type); + __half h_alpha = 0; + if (compute_type == HIPBLAS_R_16F) { + h_alpha = __float2half(sp_alpha.s); + } + const auto GetCublasOperation = [](BlasTransposeType transpose_type) { + if (transpose_type == BlasTransposeType::N) { + return HIPBLAS_OP_N; + } else if (transpose_type == BlasTransposeType::T) { + return HIPBLAS_OP_T; + } else { + UNIMPLEMENTED(); + return HIPBLAS_OP_N; + } + }; + const hipblasOperation_t cublas_trans_a = GetCublasOperation(transpose_b); + const hipblasOperation_t cublas_trans_b = GetCublasOperation(transpose_a); + const int cublas_m = n; + const int cublas_n = m; + const int cublas_k = k; + int cublas_lda = 0; + if (transpose_b == BlasTransposeType::N) { + cublas_lda = n; + } else if (transpose_b == BlasTransposeType::T) { + cublas_lda = k; + } else { + UNIMPLEMENTED(); + } + int cublas_ldb = 0; + if (transpose_a == BlasTransposeType::N) { + cublas_ldb = k; + } else if (transpose_a == BlasTransposeType::T) { + cublas_ldb = m; + } else { + 
UNIMPLEMENTED(); + } + const int cublas_ldc = n; + // CublasMathModeGuard guard(cuda_stream->cublas_handle()); +// if (data_type == DataType::kFloat16) { +// #if CUDA_VERSION < 11000 +// guard.SetMathMode(CUBLAS_TENSOR_OP_MATH); +// #else +// guard.SetMathMode(CUBLAS_DEFAULT_MATH); +// #endif // CUDA_VERSION < 11000 +// } +// #if CUDA_VERSION >= 11000 +// hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; + hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; +// #else +// hipblasGemmAlgo_t algo = +// (data_type == DataType::kFloat16) ? CUBLAS_GEMM_DFALT_TENSOR_OP : HIPBLAS_GEMM_DEFAULT; +// #endif + if (num_batch_dims == 1 && c_batch_dims[0] != 1) { + const void* cublas_a = b; + const void* cublas_b = a; + void* cublas_c = c; + const int64_t a_batch_count = a_batch_dims[0]; + const int64_t b_batch_count = b_batch_dims[0]; + CHECK(a_batch_count == 1 || b_batch_count == 1 || a_batch_count == b_batch_count); + CHECK_GT(a_batch_count, 0); + CHECK_GT(b_batch_count, 0); + const int batch_count = std::max(a_batch_count, b_batch_count); + const long long int cublas_stride_a = b_batch_count == 1 ? 0 : cublas_m * cublas_k; + const long long int cublas_stride_b = a_batch_count == 1 ? 0 : cublas_k * cublas_n; + const long long int cublas_stride_c = cublas_m * cublas_n; + const auto sp_beta = GetCublasScalarParameter(beta, compute_type); + __half h_beta = 0; + if (compute_type == HIPBLAS_R_16F) { + h_beta = __float2half(sp_beta.s); + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, + &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, + cublas_ldb, cublas_stride_b, &h_beta, cublas_c, cuda_data_type, cublas_ldc, + cublas_stride_c, batch_count, compute_type, algo)); + } else { + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k, + &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type, + cublas_ldb, cublas_stride_b, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, + cublas_stride_c, batch_count, compute_type, algo)); + } + + } else { + auto func = [&](const void* batch_a, const void* batch_b, void* batch_c, Scalar batch_beta) { + const auto sp_beta = GetCublasScalarParameter(batch_beta, compute_type); + __half h_beta = 0; + const void* cublas_a = batch_b; + const void* cublas_b = batch_a; + void* cublas_c = batch_c; + if (compute_type == HIPBLAS_R_16F) { + h_beta = __float2half(sp_beta.s); + OF_CUBLAS_CHECK(hipblasGemmEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, + cublas_k, &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, + cublas_ldb, &h_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); + } else { + OF_CUBLAS_CHECK(hipblasGemmEx( + cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, + cublas_k, &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type, + cublas_ldb, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo)); + } + + }; + ForEachMatmul(data_type, m, n, k, beta, num_batch_dims, broadcast_batch_dims, + a_batch_dims, b_batch_dims, c_batch_dims, a, b, c, func); + } +} + +class BroadcastMatmulFactoryImpl : public BroadcastMatmulFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(BroadcastMatmulFactoryImpl); + BroadcastMatmulFactoryImpl() = default; + ~BroadcastMatmulFactoryImpl() override = default; + + std::unique_ptr 
New(DataType data_type, BlasTransposeType transpose_a, + BlasTransposeType transpose_b, + size_t max_num_dims) override { + auto cuda_data_type = OptCudaDataType(data_type); + if (max_num_dims <= kMaxNumDims && cuda_data_type.has_value()) { + return std::make_unique>(data_type, transpose_a, + transpose_b); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastMatmulFactory, BroadcastMatmulFactoryImpl); + +} // namespace + +} // namespace internal + +} // namespace broadcast_matmul + +} // namespace primitive +} // namespace ep + +} // namespace oneflow + +#endif // WITH_ROCM diff --git a/oneflow/core/ep/rocm/primitive/cast.hip.cpp b/oneflow/core/ep/rocm/primitive/cast.hip.cpp index d65d126..d2e60b9 100644 --- a/oneflow/core/ep/rocm/primitive/cast.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/cast.hip.cpp @@ -1,148 +1,148 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -struct CastFunctor { - __device__ To operator()(From from) const { return static_cast(from); } -}; - -template -struct CastFunctor::value>::type> { - __device__ To operator()(half from) const { return static_cast(static_cast(from)); } - - __device__ void Apply2(To* to, const half* from) const { - const float2 f2 = __half22float2(*reinterpret_cast(from)); - to[0] = static_cast(f2.x); - to[1] = static_cast(f2.y); - } -}; - -template -struct CastFunctor::value>::type> { - __device__ half operator()(From from) const { - return static_cast(static_cast(from)); - } - - __device__ void Apply2(half* to, const From* from) const { - float2 f2; - f2.x = static_cast(from[0]); - f2.y = static_cast(from[1]); - *reinterpret_cast(to) = __float22half2_rn(f2); - } -}; - -// #if CUDA_VERSION >= 11000 - -// template -// struct CastFunctor::value -// || std::is_same::value)>::type> { -// __device__ To operator()(nv_bfloat16 from) const { -// return static_cast(static_cast(from)); -// } -// }; - -// template -// struct CastFunctor::value -// || std::is_same::value)>::type> { -// __device__ nv_bfloat16 operator()(From from) const { -// return static_cast(static_cast(from)); -// } -// }; - -// #endif // CUDA_VERSION >= 11000 - -template -class CastImpl : public Cast { - public: - OF_DISALLOW_COPY_AND_MOVE(CastImpl); - explicit CastImpl() = default; - ~CastImpl() override = default; - - void Launch(Stream* stream, const void* from, void* to, size_t count) override { - auto* cuda_stream = stream->As(); - OF_CUDA_CHECK((cuda::elementwise::Unary, To, From>( - CastFunctor(), count, reinterpret_cast(to), - reinterpret_cast(from), cuda_stream->cuda_stream()))); - } -}; - -template -std::unique_ptr NewCast() { - return 
std::unique_ptr(new CastImpl()); -} - -#define CUDA_PRIMITIVE_CAST_TYPE_SEQ \ - CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ - CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ - CUDA_PRIMITIVE_INT8_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ - CUDA_PRIMITIVE_INT32_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT32_TYPE_SEQ \ - CUDA_PRIMITIVE_INT64_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT64_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -class CastFactoryImpl : public CastFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(CastFactoryImpl); - CastFactoryImpl() = default; - ~CastFactoryImpl() override = default; - - std::unique_ptr New(DataType from, DataType to) override { -#define MAKE_NEW_CAST_ENTRY(from_pair, to_pair) \ - {std::make_pair(OF_PP_PAIR_SECOND(from_pair), OF_PP_PAIR_SECOND(to_pair)), \ - NewCast}, - - static const std::map, std::function()>> - new_cast_handle{OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - MAKE_NEW_CAST_ENTRY, CUDA_PRIMITIVE_CAST_TYPE_SEQ, CUDA_PRIMITIVE_CAST_TYPE_SEQ)}; - -#undef MAKE_NEW_CAST_ENTRY - - const auto it = new_cast_handle.find(std::make_pair(from, to)); - if (it != new_cast_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CastFactory, CastFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
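CastFactoryImpl in this file resolves a (from, to) data-type pair through a static table whose entries are generated by OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE over CUDA_PRIMITIVE_CAST_TYPE_SEQ, each mapping the enum pair to a NewCast factory. A hand-rolled miniature of that lookup pattern follows; the enum, type names, and table contents are illustrative only, not OneFlow's:

#include <cstddef>
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <utility>

enum class DType { kFloat, kDouble };

struct CastBase {
  virtual ~CastBase() = default;
  virtual void Launch(const void* from, void* to, std::size_t count) = 0;
};

// Element-wise static_cast, the CPU analogue of one CastImpl instantiation.
template<typename From, typename To>
struct CastImpl : CastBase {
  void Launch(const void* from, void* to, std::size_t count) override {
    auto* src = static_cast<const From*>(from);
    auto* dst = static_cast<To*>(to);
    for (std::size_t i = 0; i < count; ++i) dst[i] = static_cast<To>(src[i]);
  }
};

template<typename From, typename To>
std::unique_ptr<CastBase> NewCast() { return std::make_unique<CastImpl<From, To>>(); }

std::unique_ptr<CastBase> MakeCast(DType from, DType to) {
  // In the patch this table is produced by a macro product over the cast type seq;
  // here the two entries are written out by hand.
  static const std::map<std::pair<DType, DType>, std::function<std::unique_ptr<CastBase>()>>
      table{{{DType::kFloat, DType::kDouble}, NewCast<float, double>},
            {{DType::kDouble, DType::kFloat}, NewCast<double, float>}};
  auto it = table.find({from, to});
  if (it == table.end()) return nullptr;
  return it->second();
}

int main() {
  float src[3] = {1.f, 2.f, 3.f};
  double dst[3] = {};
  MakeCast(DType::kFloat, DType::kDouble)->Launch(src, dst, 3);
  std::printf("%f %f %f\n", dst[0], dst[1], dst[2]);
  return 0;
}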
+*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +struct CastFunctor { + __device__ To operator()(From from) const { return static_cast(from); } +}; + +template +struct CastFunctor::value>::type> { + __device__ To operator()(half from) const { return static_cast(static_cast(from)); } + + __device__ void Apply2(To* to, const half* from) const { + const float2 f2 = __half22float2(*reinterpret_cast(from)); + to[0] = static_cast(f2.x); + to[1] = static_cast(f2.y); + } +}; + +template +struct CastFunctor::value>::type> { + __device__ half operator()(From from) const { + return static_cast(static_cast(from)); + } + + __device__ void Apply2(half* to, const From* from) const { + float2 f2; + f2.x = static_cast(from[0]); + f2.y = static_cast(from[1]); + *reinterpret_cast(to) = __float22half2_rn(f2); + } +}; + +// #if CUDA_VERSION >= 11000 + +// template +// struct CastFunctor::value +// || std::is_same::value)>::type> { +// __device__ To operator()(nv_bfloat16 from) const { +// return static_cast(static_cast(from)); +// } +// }; + +// template +// struct CastFunctor::value +// || std::is_same::value)>::type> { +// __device__ nv_bfloat16 operator()(From from) const { +// return static_cast(static_cast(from)); +// } +// }; + +// #endif // CUDA_VERSION >= 11000 + +template +class CastImpl : public Cast { + public: + OF_DISALLOW_COPY_AND_MOVE(CastImpl); + explicit CastImpl() = default; + ~CastImpl() override = default; + + void Launch(Stream* stream, const void* from, void* to, size_t count) override { + auto* cuda_stream = stream->As(); + OF_CUDA_CHECK((cuda::elementwise::Unary, To, From>( + CastFunctor(), count, reinterpret_cast(to), + reinterpret_cast(from), cuda_stream->cuda_stream()))); + } +}; + +template +std::unique_ptr NewCast() { + return std::unique_ptr(new CastImpl()); +} + +#define CUDA_PRIMITIVE_CAST_TYPE_SEQ \ + CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ + CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ + CUDA_PRIMITIVE_INT8_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ + CUDA_PRIMITIVE_INT32_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT32_TYPE_SEQ \ + CUDA_PRIMITIVE_INT64_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT64_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +class CastFactoryImpl : public CastFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(CastFactoryImpl); + CastFactoryImpl() = default; + ~CastFactoryImpl() override = default; + + std::unique_ptr New(DataType from, DataType to) override { +#define MAKE_NEW_CAST_ENTRY(from_pair, to_pair) \ + {std::make_pair(OF_PP_PAIR_SECOND(from_pair), OF_PP_PAIR_SECOND(to_pair)), \ + NewCast}, + + static const std::map, std::function()>> + new_cast_handle{OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + MAKE_NEW_CAST_ENTRY, CUDA_PRIMITIVE_CAST_TYPE_SEQ, CUDA_PRIMITIVE_CAST_TYPE_SEQ)}; + +#undef MAKE_NEW_CAST_ENTRY + + const auto it = new_cast_handle.find(std::make_pair(from, to)); + if (it != new_cast_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CastFactory, CastFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp 
b/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp index fd0d037..be1a539 100644 --- a/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/constant_pad.hip.cpp @@ -1,255 +1,255 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/include/primitive/constant_pad.h" -#include "oneflow/core/ep/common/primitive/constant_pad.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { - -namespace primitive { - -namespace { - -template -__global__ void ConstantPadKernel(ConstantPadParams params, - StorageType packed_pad_val) { - const StorageType* src = reinterpret_cast(params.src); - StorageType* dst = reinterpret_cast(params.dst); - IndexType src_index[num_dims]; - IndexType dst_index[num_dims]; - CUDA_1D_KERNEL_LOOP_T(IndexType, linear_index, params.elem_cnt) { - params.dst_index_helper.OffsetToNdIndex(linear_index, dst_index); - bool if_pad = false; -#pragma unroll - for (int i = 0; i < num_dims; i++) { - if (dst_index[i] >= params.valid_start[i] && dst_index[i] < params.valid_end[i]) { - src_index[i] = dst_index[i] - params.valid_start[i]; - } else { - if_pad = true; - break; - } - } - StorageType dst_val = packed_pad_val; - if (!if_pad) { - const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); - dst_val = src[src_offset]; - } - dst[linear_index] = dst_val; - } -} - -template<> -half GetValue(Scalar value) { - return static_cast(GetValue(value)); -} - -// #if CUDA_VERSION >= 11000 - -// template<> -// nv_bfloat16 GetValue(Scalar value) { -// return static_cast(GetValue(value)); -// } - -// #endif // CUDA_VERSION >= 11000 - -template -void LaunchKernel(Stream* stream, ConstantPadParams params, - StorageType packed_pad_val, size_t elem_cnt) { - stream->As()->LaunchKernelDefaultWaves( - (ConstantPadKernel), elem_cnt, params, packed_pad_val); -} - -template -void LaunchKernel(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, - const int64_t* src_dims, const int64_t* padding_before, - const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { - ConstantPadParams params; - params.dst_index_helper = OffsetToIndexCalculator(dst_dims); - params.src_index_helper = NdIndexOffsetHelper(src_dims); - params.dst = dst; - params.src = src; - for (int i = 0; i < num_dims; i++) { - params.valid_start[i] = padding_before[i]; - params.valid_end[i] = dst_dims[i] - padding_after[i]; - } - params.elem_cnt = elem_cnt; - LaunchKernel(stream, params, packed_pad_val, elem_cnt); -} - -template -void DispatchIndexType(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, - const int64_t* src_dims, const int64_t* padding_before, - const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - LaunchKernel(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, packed_pad_val, - 
elem_cnt); - } else { - LaunchKernel(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, packed_pad_val, - elem_cnt); - } -} - -template -void DispatchPackSize(Stream* stream, void* dst, int64_t* dst_dims, const void* src, - int64_t* src_dims, int64_t* padding_before, int64_t* padding_after, - T pad_val) { - constexpr int32_t max_packsize = GetMaxPackSize(); - size_t launch_pack_size = GetLaunchPackSize(num_dims, dst, dst_dims, src, src_dims, - padding_before, padding_after); - - dst_dims[num_dims - 1] /= launch_pack_size; - src_dims[num_dims - 1] /= launch_pack_size; - padding_before[num_dims - 1] /= launch_pack_size; - padding_after[num_dims - 1] /= launch_pack_size; - - size_t elem_cnt = 1; - for (int i = 0; i < num_dims; i++) { elem_cnt *= dst_dims[i]; } - if (launch_pack_size == 1) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 2) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 4) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 8) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else if (launch_pack_size == 16) { - Pack packed_pad_val(pad_val); - DispatchIndexType>(stream, dst, dst_dims, src, src_dims, - padding_before, padding_after, - packed_pad_val.storage, elem_cnt); - } else { - UNIMPLEMENTED(); - } -} - -template -void LaunchWithSimplified(Stream* stream, size_t num_dims, void* dst, int64_t* dst_dims, - const void* src, int64_t* src_dims, int64_t* padding_before, - int64_t* padding_after, T pad_val) { - void (*func)(Stream* /*stream*/, void* /*dst*/, int64_t* /*dst_dims*/, const void* /*src*/, - int64_t* /*src_dims*/, int64_t* /*padding_before*/, int64_t* /*padding_after*/, T) = - nullptr; - if (num_dims == 1) { - func = DispatchPackSize<1, T>; - } else if (num_dims == 2) { - func = DispatchPackSize<2, T>; - } else if (num_dims == 3) { - func = DispatchPackSize<3, T>; - } else if (num_dims == 4) { - func = DispatchPackSize<4, T>; - } else if (num_dims == 5) { - func = DispatchPackSize<5, T>; - } else if (num_dims == 6) { - func = DispatchPackSize<6, T>; - } else if (num_dims == 7) { - func = DispatchPackSize<7, T>; - } else if (num_dims == 8) { - func = DispatchPackSize<8, T>; - } else { - UNIMPLEMENTED(); - } - func(stream, dst, dst_dims, src, src_dims, padding_before, padding_after, pad_val); -} - -template -void SimplifyThenLaunch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, - const int64_t* padding_before, const int64_t* padding_after, T pad_val, - void* dst) { - CHECK_LE(num_dims, kMaxNumDims); - int64_t simplified_dst_dims[kMaxNumDims]; - int64_t simplified_src_dims[kMaxNumDims]; - int64_t simplified_padding_before[kMaxNumDims]; - int64_t simplified_padding_after[kMaxNumDims]; - size_t simplified_num_dims = 1; - SimplifyPadDims(num_dims, src_dims, padding_before, padding_after, &simplified_num_dims, - simplified_dst_dims, simplified_src_dims, simplified_padding_before, - simplified_padding_after); - LaunchWithSimplified(stream, simplified_num_dims, dst, 
simplified_dst_dims, src, - simplified_src_dims, simplified_padding_before, simplified_padding_after, - pad_val); -} - -template -class ConstantPadImpl : public ConstantPad { - public: - OF_DISALLOW_COPY_AND_MOVE(ConstantPadImpl); - ConstantPadImpl() = default; - ~ConstantPadImpl() override = default; - - void Launch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, - const int64_t* padding_before, const int64_t* padding_after, Scalar pad_val, - void* dst) override { - SimplifyThenLaunch(stream, num_dims, src_dims, src, padding_before, padding_after, - GetValue(pad_val), dst); - } -}; - -template -std::unique_ptr NewConstantPad() { - return std::unique_ptr(new ConstantPadImpl()); -} - -class ConstantPadFactoryImpl : public ConstantPadFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(ConstantPadFactoryImpl); - ConstantPadFactoryImpl() = default; - ~ConstantPadFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_CONSTANT_PAD_ENTRY(type_cpp, type_proto) {type_proto, NewConstantPad}, - - static const std::map()>> - new_constant_pad_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_CONSTANT_PAD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; - -#undef MAKE_NEW_CONSTANT_PAD_ENTRY - - const auto it = new_constant_pad_handle.find(data_type); - if (it != new_constant_pad_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ConstantPadFactory, ConstantPadFactoryImpl); - -} // namespace - -} // namespace primitive - -} // namespace ep - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
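ConstantPadKernel above works per output element: it converts the linear offset to an n-d index, tests every axis against [valid_start, valid_end) where valid_end is dst_dim minus padding_after, and either copies the corresponding source element or writes the packed pad value. The same index logic rendered as a plain 2-d CPU routine, with illustrative names:

#include <cstdio>
#include <vector>

void ConstantPad2D(const std::vector<float>& src, int src_h, int src_w,
                   int pad_top, int pad_bottom, int pad_left, int pad_right, float pad_val,
                   std::vector<float>& dst) {
  const int dst_h = src_h + pad_top + pad_bottom;
  const int dst_w = src_w + pad_left + pad_right;
  dst.assign(dst_h * dst_w, 0.f);
  for (int linear = 0; linear < dst_h * dst_w; ++linear) {
    // Offset -> n-d index, as OffsetToNdIndex does in the kernel.
    const int y = linear / dst_w;
    const int x = linear % dst_w;
    // Inside [valid_start, valid_end) on both axes?
    const bool in_valid = y >= pad_top && y < pad_top + src_h
                          && x >= pad_left && x < pad_left + src_w;
    dst[linear] = in_valid ? src[(y - pad_top) * src_w + (x - pad_left)] : pad_val;
  }
}

int main() {
  std::vector<float> src = {1, 2, 3, 4};  // 2 x 2 input
  std::vector<float> dst;
  ConstantPad2D(src, 2, 2, 1, 1, 1, 1, -1.f, dst);  // pad to 4 x 4 with -1
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) std::printf("%5.1f", dst[y * 4 + x]);
    std::printf("\n");
  }
  return 0;
}

The Pack/DispatchPackSize machinery in the hunk only changes how many such elements are written per store along the innermost dimension; the per-element valid-range decision stays the same.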
+*/ +#include "oneflow/core/ep/include/primitive/constant_pad.h" +#include "oneflow/core/ep/common/primitive/constant_pad.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { + +namespace primitive { + +namespace { + +template +__global__ void ConstantPadKernel(ConstantPadParams params, + StorageType packed_pad_val) { + const StorageType* src = reinterpret_cast(params.src); + StorageType* dst = reinterpret_cast(params.dst); + IndexType src_index[num_dims]; + IndexType dst_index[num_dims]; + CUDA_1D_KERNEL_LOOP_T(IndexType, linear_index, params.elem_cnt) { + params.dst_index_helper.OffsetToNdIndex(linear_index, dst_index); + bool if_pad = false; +#pragma unroll + for (int i = 0; i < num_dims; i++) { + if (dst_index[i] >= params.valid_start[i] && dst_index[i] < params.valid_end[i]) { + src_index[i] = dst_index[i] - params.valid_start[i]; + } else { + if_pad = true; + break; + } + } + StorageType dst_val = packed_pad_val; + if (!if_pad) { + const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); + dst_val = src[src_offset]; + } + dst[linear_index] = dst_val; + } +} + +template<> +half GetValue(Scalar value) { + return static_cast(GetValue(value)); +} + +// #if CUDA_VERSION >= 11000 + +// template<> +// nv_bfloat16 GetValue(Scalar value) { +// return static_cast(GetValue(value)); +// } + +// #endif // CUDA_VERSION >= 11000 + +template +void LaunchKernel(Stream* stream, ConstantPadParams params, + StorageType packed_pad_val, size_t elem_cnt) { + stream->As()->LaunchKernelDefaultWaves( + (ConstantPadKernel), elem_cnt, params, packed_pad_val); +} + +template +void LaunchKernel(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, + const int64_t* src_dims, const int64_t* padding_before, + const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { + ConstantPadParams params; + params.dst_index_helper = OffsetToIndexCalculator(dst_dims); + params.src_index_helper = NdIndexOffsetHelper(src_dims); + params.dst = dst; + params.src = src; + for (int i = 0; i < num_dims; i++) { + params.valid_start[i] = padding_before[i]; + params.valid_end[i] = dst_dims[i] - padding_after[i]; + } + params.elem_cnt = elem_cnt; + LaunchKernel(stream, params, packed_pad_val, elem_cnt); +} + +template +void DispatchIndexType(Stream* stream, void* dst, const int64_t* dst_dims, const void* src, + const int64_t* src_dims, const int64_t* padding_before, + const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + LaunchKernel(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, packed_pad_val, + elem_cnt); + } else { + LaunchKernel(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, packed_pad_val, + elem_cnt); + } +} + +template +void DispatchPackSize(Stream* stream, void* dst, int64_t* dst_dims, const void* src, + int64_t* src_dims, int64_t* padding_before, int64_t* padding_after, + T pad_val) { + constexpr int32_t max_packsize = GetMaxPackSize(); + size_t launch_pack_size = GetLaunchPackSize(num_dims, dst, dst_dims, src, src_dims, + padding_before, padding_after); + + dst_dims[num_dims - 1] /= launch_pack_size; + src_dims[num_dims - 1] /= launch_pack_size; + padding_before[num_dims - 1] /= launch_pack_size; + padding_after[num_dims - 1] /= launch_pack_size; + + size_t elem_cnt = 1; + for (int i = 0; i < num_dims; i++) { elem_cnt *= dst_dims[i]; } + if (launch_pack_size == 1) { + 
Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 2) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 4) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 8) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else if (launch_pack_size == 16) { + Pack packed_pad_val(pad_val); + DispatchIndexType>(stream, dst, dst_dims, src, src_dims, + padding_before, padding_after, + packed_pad_val.storage, elem_cnt); + } else { + UNIMPLEMENTED(); + } +} + +template +void LaunchWithSimplified(Stream* stream, size_t num_dims, void* dst, int64_t* dst_dims, + const void* src, int64_t* src_dims, int64_t* padding_before, + int64_t* padding_after, T pad_val) { + void (*func)(Stream* /*stream*/, void* /*dst*/, int64_t* /*dst_dims*/, const void* /*src*/, + int64_t* /*src_dims*/, int64_t* /*padding_before*/, int64_t* /*padding_after*/, T) = + nullptr; + if (num_dims == 1) { + func = DispatchPackSize<1, T>; + } else if (num_dims == 2) { + func = DispatchPackSize<2, T>; + } else if (num_dims == 3) { + func = DispatchPackSize<3, T>; + } else if (num_dims == 4) { + func = DispatchPackSize<4, T>; + } else if (num_dims == 5) { + func = DispatchPackSize<5, T>; + } else if (num_dims == 6) { + func = DispatchPackSize<6, T>; + } else if (num_dims == 7) { + func = DispatchPackSize<7, T>; + } else if (num_dims == 8) { + func = DispatchPackSize<8, T>; + } else { + UNIMPLEMENTED(); + } + func(stream, dst, dst_dims, src, src_dims, padding_before, padding_after, pad_val); +} + +template +void SimplifyThenLaunch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, + const int64_t* padding_before, const int64_t* padding_after, T pad_val, + void* dst) { + CHECK_LE(num_dims, kMaxNumDims); + int64_t simplified_dst_dims[kMaxNumDims]; + int64_t simplified_src_dims[kMaxNumDims]; + int64_t simplified_padding_before[kMaxNumDims]; + int64_t simplified_padding_after[kMaxNumDims]; + size_t simplified_num_dims = 1; + SimplifyPadDims(num_dims, src_dims, padding_before, padding_after, &simplified_num_dims, + simplified_dst_dims, simplified_src_dims, simplified_padding_before, + simplified_padding_after); + LaunchWithSimplified(stream, simplified_num_dims, dst, simplified_dst_dims, src, + simplified_src_dims, simplified_padding_before, simplified_padding_after, + pad_val); +} + +template +class ConstantPadImpl : public ConstantPad { + public: + OF_DISALLOW_COPY_AND_MOVE(ConstantPadImpl); + ConstantPadImpl() = default; + ~ConstantPadImpl() override = default; + + void Launch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src, + const int64_t* padding_before, const int64_t* padding_after, Scalar pad_val, + void* dst) override { + SimplifyThenLaunch(stream, num_dims, src_dims, src, padding_before, padding_after, + GetValue(pad_val), dst); + } +}; + +template +std::unique_ptr NewConstantPad() { + return std::unique_ptr(new ConstantPadImpl()); +} + +class ConstantPadFactoryImpl : public ConstantPadFactory { + public: + 
OF_DISALLOW_COPY_AND_MOVE(ConstantPadFactoryImpl); + ConstantPadFactoryImpl() = default; + ~ConstantPadFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_CONSTANT_PAD_ENTRY(type_cpp, type_proto) {type_proto, NewConstantPad}, + + static const std::map()>> + new_constant_pad_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_CONSTANT_PAD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; + +#undef MAKE_NEW_CONSTANT_PAD_ENTRY + + const auto it = new_constant_pad_handle.find(data_type); + if (it != new_constant_pad_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ConstantPadFactory, ConstantPadFactoryImpl); + +} // namespace + +} // namespace primitive + +} // namespace ep + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp b/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp index 8dc4589..8b60dcf 100644 --- a/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/copy_nd.hip.cpp @@ -1,95 +1,95 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/ep/common/primitive/copy_nd.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -__global__ void CopyNdKernel(CopyNdKernelParams params) { - using T = typename std::aligned_storage::type; - const T* src = reinterpret_cast(params.src); - T* dst = reinterpret_cast(params.dst); - IndexType copy_index[num_dims]; - IndexType src_index[num_dims]; - IndexType dst_index[num_dims]; - CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { - params.copy_index_helper.OffsetToNdIndex(i, copy_index); -#pragma unroll - for (size_t j = 0; j < num_dims; ++j) { - src_index[j] = params.src_pos[j] + copy_index[j]; - dst_index[j] = params.dst_pos[j] + copy_index[j]; - } - const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); - const IndexType dst_offset = params.dst_index_helper.NdIndexToOffset(dst_index); - dst[dst_offset] = src[src_offset]; - } -} - -template -void LaunchKernel(Stream* stream, CopyNdKernelParams params) { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - CopyNdKernel - <<>>(params); -} - -class CopyNdImpl : public CopyNd { - public: - OF_DISALLOW_COPY_AND_MOVE(CopyNdImpl); - CopyNdImpl() = default; - ~CopyNdImpl() override = default; - - void Launch(Stream* stream, DataType data_type, size_t num_dims, void* dst, - const int64_t* dst_dims, const int64_t* dst_pos, const void* src, - const int64_t* src_dims, const int64_t* src_pos, - const int64_t* extent) const override { - SimplifyThenLaunch(stream, data_type, num_dims, dst, dst_dims, dst_pos, src, src_dims, src_pos, - extent); - } -}; - -class CopyNdFactoryImpl : public CopyNdFactory { - public: - 
OF_DISALLOW_COPY_AND_MOVE(CopyNdFactoryImpl); - CopyNdFactoryImpl() = default; - ~CopyNdFactoryImpl() override = default; - - std::unique_ptr New(size_t max_num_dims) override { - if (max_num_dims <= kMaxNumDims) { - return std::unique_ptr(new CopyNdImpl()); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CopyNdFactory, CopyNdFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/ep/common/primitive/copy_nd.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +__global__ void CopyNdKernel(CopyNdKernelParams params) { + using T = typename std::aligned_storage::type; + const T* src = reinterpret_cast(params.src); + T* dst = reinterpret_cast(params.dst); + IndexType copy_index[num_dims]; + IndexType src_index[num_dims]; + IndexType dst_index[num_dims]; + CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { + params.copy_index_helper.OffsetToNdIndex(i, copy_index); +#pragma unroll + for (size_t j = 0; j < num_dims; ++j) { + src_index[j] = params.src_pos[j] + copy_index[j]; + dst_index[j] = params.dst_pos[j] + copy_index[j]; + } + const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); + const IndexType dst_offset = params.dst_index_helper.NdIndexToOffset(dst_index); + dst[dst_offset] = src[src_offset]; + } +} + +template +void LaunchKernel(Stream* stream, CopyNdKernelParams params) { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + CopyNdKernel + <<>>(params); +} + +class CopyNdImpl : public CopyNd { + public: + OF_DISALLOW_COPY_AND_MOVE(CopyNdImpl); + CopyNdImpl() = default; + ~CopyNdImpl() override = default; + + void Launch(Stream* stream, DataType data_type, size_t num_dims, void* dst, + const int64_t* dst_dims, const int64_t* dst_pos, const void* src, + const int64_t* src_dims, const int64_t* src_pos, + const int64_t* extent) const override { + SimplifyThenLaunch(stream, data_type, num_dims, dst, dst_dims, dst_pos, src, src_dims, src_pos, + extent); + } +}; + +class CopyNdFactoryImpl : public CopyNdFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(CopyNdFactoryImpl); + CopyNdFactoryImpl() = default; + ~CopyNdFactoryImpl() override = default; + + std::unique_ptr New(size_t max_num_dims) override { + if (max_num_dims <= kMaxNumDims) { + return std::unique_ptr(new CopyNdImpl()); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CopyNdFactory, CopyNdFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp b/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp index 1f0c93b..c04763a 100644 --- 
a/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/elementwise_unary.hip.cpp @@ -1,117 +1,117 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/common/primitive/elementwise_unary.h" -#include "oneflow/core/ep/rocm/primitive/unary_functor.hip.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -class ElementwiseUnaryImpl : public ElementwiseUnary { - public: - OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryImpl); - ElementwiseUnaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} - ~ElementwiseUnaryImpl() override = default; - - void Launch(Stream* stream, const void* src, void* dst, size_t count) override { - auto* cuda_stream = stream->As(); - auto functor = UnaryFunctor(attr0, attr1); - OF_CUDA_CHECK((cuda::elementwise::Unary( - functor, count, reinterpret_cast(dst), reinterpret_cast(src), - cuda_stream->cuda_stream()))); - } - - protected: - Scalar attr0, attr1; -}; - -template -std::unique_ptr NewElementwiseUnary(Scalar attr0, Scalar attr1) { - return std::unique_ptr( - new ElementwiseUnaryImpl(attr0, attr1)); -} - -class ElementwiseUnaryFactoryImpl : public ElementwiseUnaryFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryFactoryImpl); - ElementwiseUnaryFactoryImpl() = default; - ~ElementwiseUnaryFactoryImpl() override = default; - - std::unique_ptr New(UnaryOp unary_op, DataType src_type, - DataType dst_dtype) override { - return New(unary_op, src_type, dst_dtype, Scalar(), Scalar()); - } - - std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, - Scalar attr0) override { - return New(unary_op, src_type, dst_dtype, attr0, Scalar()); - } - - std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, - Scalar attr0, Scalar attr1) override { -#define MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, dtype_pair) \ - {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(dtype_pair), OF_PP_PAIR_SECOND(dtype_pair)), \ - NewElementwiseUnary}, - -#define MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, src_type_pair, dst_dtype_pair) \ - {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(src_type_pair), OF_PP_PAIR_SECOND(dst_dtype_pair)), \ - NewElementwiseUnary}, - - static const std::map, - std::function(Scalar, Scalar)>> - new_elementwise_unary_handle{ - // For All Type OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) - // For Float Type OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_FLOATING_MATH_OP_SEQ, - CUDA_PRIMITIVE_FLOATING_TYPE_SEQ) - - // For Utils OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_UTILS_OP_SEQ, UTIL_OPS_DATA_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ) - - // For Logical OP - OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, - UNARY_LOGICAL_OP_SEQ, 
CUDA_PRIMITIVE_ALL_TYPE_SEQ, - CUDA_PRIMITIVE_BOOL_TYPE_SEQ)}; - -#undef MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY - -#undef MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY - const auto it = - new_elementwise_unary_handle.find(std::make_tuple(unary_op, src_type, dst_dtype)); - if (it != new_elementwise_unary_handle.end()) { - return it->second(attr0, attr1); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ElementwiseUnaryFactory, ElementwiseUnaryFactoryImpl); - -} // namespace -} // namespace primitive -} // namespace ep +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/common/primitive/elementwise_unary.h" +#include "oneflow/core/ep/rocm/primitive/unary_functor.hip.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +class ElementwiseUnaryImpl : public ElementwiseUnary { + public: + OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryImpl); + ElementwiseUnaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {} + ~ElementwiseUnaryImpl() override = default; + + void Launch(Stream* stream, const void* src, void* dst, size_t count) override { + auto* cuda_stream = stream->As(); + auto functor = UnaryFunctor(attr0, attr1); + OF_CUDA_CHECK((cuda::elementwise::Unary( + functor, count, reinterpret_cast(dst), reinterpret_cast(src), + cuda_stream->cuda_stream()))); + } + + protected: + Scalar attr0, attr1; +}; + +template +std::unique_ptr NewElementwiseUnary(Scalar attr0, Scalar attr1) { + return std::unique_ptr( + new ElementwiseUnaryImpl(attr0, attr1)); +} + +class ElementwiseUnaryFactoryImpl : public ElementwiseUnaryFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryFactoryImpl); + ElementwiseUnaryFactoryImpl() = default; + ~ElementwiseUnaryFactoryImpl() override = default; + + std::unique_ptr New(UnaryOp unary_op, DataType src_type, + DataType dst_dtype) override { + return New(unary_op, src_type, dst_dtype, Scalar(), Scalar()); + } + + std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, + Scalar attr0) override { + return New(unary_op, src_type, dst_dtype, attr0, Scalar()); + } + + std::unique_ptr New(UnaryOp unary_op, DataType src_type, DataType dst_dtype, + Scalar attr0, Scalar attr1) override { +#define MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, dtype_pair) \ + {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(dtype_pair), OF_PP_PAIR_SECOND(dtype_pair)), \ + NewElementwiseUnary}, + +#define MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, src_type_pair, dst_dtype_pair) \ + {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(src_type_pair), OF_PP_PAIR_SECOND(dst_dtype_pair)), \ + NewElementwiseUnary}, + + static const std::map, + std::function(Scalar, Scalar)>> + new_elementwise_unary_handle{ + // For All Type OP + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ) + // For Float Type OP + 
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_FLOATING_MATH_OP_SEQ, + CUDA_PRIMITIVE_FLOATING_TYPE_SEQ) + + // For Utils OP + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_UTILS_OP_SEQ, UTIL_OPS_DATA_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ) + + // For Logical OP + OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY, + UNARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ, + CUDA_PRIMITIVE_BOOL_TYPE_SEQ)}; + +#undef MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY + +#undef MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY + const auto it = + new_elementwise_unary_handle.find(std::make_tuple(unary_op, src_type, dst_dtype)); + if (it != new_elementwise_unary_handle.end()) { + return it->second(attr0, attr1); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ElementwiseUnaryFactory, ElementwiseUnaryFactoryImpl); + +} // namespace +} // namespace primitive +} // namespace ep } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/fill.hip.cpp b/oneflow/core/ep/rocm/primitive/fill.hip.cpp index c77b251..a81d6a6 100644 --- a/oneflow/core/ep/rocm/primitive/fill.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/fill.hip.cpp @@ -1,151 +1,151 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
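// The elementwise-unary factory above keys its dispatch table by the triple
// (unary op, source dtype, destination dtype), so one UnaryOp can map to different
// instantiations (for example, logical ops always produce bool outputs). A minimal
// sketch of that tuple-keyed lookup with hypothetical enums and lambdas; it is not
// OneFlow's UnaryFunctor machinery.
#include <functional>
#include <iostream>
#include <map>
#include <tuple>

enum class ToyOp { kRelu, kLogicalNot };
enum class ToyType { kFloat, kBool };

using ToyFn = std::function<double(double)>;

static ToyFn Lookup(ToyOp op, ToyType src, ToyType dst) {
  static const std::map<std::tuple<ToyOp, ToyType, ToyType>, ToyFn> handle{
      // Same-dtype math op: float -> float.
      {std::make_tuple(ToyOp::kRelu, ToyType::kFloat, ToyType::kFloat),
       [](double x) { return x > 0.0 ? x : 0.0; }},
      // Different-dtype logical op: float in, bool out.
      {std::make_tuple(ToyOp::kLogicalNot, ToyType::kFloat, ToyType::kBool),
       [](double x) { return x == 0.0 ? 1.0 : 0.0; }},
  };
  const auto it = handle.find(std::make_tuple(op, src, dst));
  return it != handle.end() ? it->second : ToyFn();  // empty == unsupported combination
}

int main() {
  const ToyFn relu = Lookup(ToyOp::kRelu, ToyType::kFloat, ToyType::kFloat);
  std::cout << relu(-2.5) << " " << relu(3.0) << "\n";  // prints: 0 3
  const ToyFn missing = Lookup(ToyOp::kRelu, ToyType::kFloat, ToyType::kBool);
  std::cout << (missing ? "found" : "unsupported") << "\n";  // prints: unsupported
  return 0;
}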
-*/ -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -template -using Storage = typename std::aligned_storage::type; - -template -union Pack { - static constexpr size_t size = sizeof(T) * pack; - explicit __device__ __host__ Pack(T value) { - static_assert(sizeof(Pack) == size, ""); - static_assert(alignof(Pack) == size, ""); -#pragma unroll - for (size_t i = 0; i < pack; ++i) { elem[i] = value; } - } - T elem[pack]; - Storage storage; -}; - -template -__global__ void FillGpu(T* dst, T value, size_t count) { - const size_t pack_count = count / pack; - Pack pack_value(value); - auto* pack_dst = reinterpret_cast(dst); - CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; } - T* tail_dst = dst + pack_count * pack; - const size_t tail_count = count - pack_count * pack; - CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = value; } -} - -template -T GetValue(Scalar value) { - return value.Value(); -} - -template<> -half GetValue(Scalar value) { - return static_cast(GetValue(value)); -} - -// #if CUDA_VERSION >= 11000 - -// template<> -// nv_bfloat16 GetValue(Scalar value) { -// return static_cast(GetValue(value)); -// } - -// #endif // CUDA_VERSION >= 11000 - -template -typename std::enable_if<(pack != 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, - T value, size_t count) { - FillGpu - <<>>(dst, value, count); -} - -template -typename std::enable_if<(pack == 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, - T value, size_t count) { - LOG(FATAL) << "wrong alignment"; -} - -template -void LaunchFill(hipStream_t stream, T* dst, T value, size_t count) { - auto uintptr = reinterpret_cast(dst); - if (uintptr % 16 == 0) { - LaunchPackFill(stream, dst, value, count); - } else if (uintptr % 8 == 0) { - LaunchPackFill(stream, dst, value, count); - } else if (uintptr % 4 == 0) { - LaunchPackFill(stream, dst, value, count); - } else if (uintptr % 2 == 0) { - LaunchPackFill(stream, dst, value, count); - } else { - LaunchPackFill(stream, dst, value, count); - } -} - -template -class FillImpl : public Fill { - public: - OF_DISALLOW_COPY_AND_MOVE(FillImpl); - FillImpl() = default; - ~FillImpl() override = default; - - void Launch(Stream* stream, void* dst, Scalar value, size_t count) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - LaunchFill(cuda_stream, reinterpret_cast(dst), GetValue(value), count); - } -}; - -template -std::unique_ptr NewFill() { - return std::unique_ptr(new FillImpl()); -} - -class FillFactoryImpl : public FillFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(FillFactoryImpl); - FillFactoryImpl() = default; - ~FillFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewFill}, - - static const std::map()>> new_fill_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; - -#undef MAKE_NEW_FILL_ENTRY - - const auto it = new_fill_handle.find(data_type); - if (it != new_fill_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, FillFactory, FillFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +template +using Storage = typename std::aligned_storage::type; + +template +union Pack { + static constexpr size_t size = sizeof(T) * pack; + explicit __device__ __host__ Pack(T value) { + static_assert(sizeof(Pack) == size, ""); + static_assert(alignof(Pack) == size, ""); +#pragma unroll + for (size_t i = 0; i < pack; ++i) { elem[i] = value; } + } + T elem[pack]; + Storage storage; +}; + +template +__global__ void FillGpu(T* dst, T value, size_t count) { + const size_t pack_count = count / pack; + Pack pack_value(value); + auto* pack_dst = reinterpret_cast(dst); + CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; } + T* tail_dst = dst + pack_count * pack; + const size_t tail_count = count - pack_count * pack; + CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = value; } +} + +template +T GetValue(Scalar value) { + return value.Value(); +} + +template<> +half GetValue(Scalar value) { + return static_cast(GetValue(value)); +} + +// #if CUDA_VERSION >= 11000 + +// template<> +// nv_bfloat16 GetValue(Scalar value) { +// return static_cast(GetValue(value)); +// } + +// #endif // CUDA_VERSION >= 11000 + +template +typename std::enable_if<(pack != 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, + T value, size_t count) { + FillGpu + <<>>(dst, value, count); +} + +template +typename std::enable_if<(pack == 0), void>::type LaunchPackFill(hipStream_t stream, T* dst, + T value, size_t count) { + LOG(FATAL) << "wrong alignment"; +} + +template +void LaunchFill(hipStream_t stream, T* dst, T value, size_t count) { + auto uintptr = reinterpret_cast(dst); + if (uintptr % 16 == 0) { + LaunchPackFill(stream, dst, value, count); + } else if (uintptr % 8 == 0) { + LaunchPackFill(stream, dst, value, count); + } else if (uintptr % 4 == 0) { + LaunchPackFill(stream, dst, value, count); + } else if (uintptr % 2 == 0) { + LaunchPackFill(stream, dst, value, count); + } else { + LaunchPackFill(stream, dst, value, count); + } +} + +template +class FillImpl : public Fill { + public: + OF_DISALLOW_COPY_AND_MOVE(FillImpl); + FillImpl() = default; + ~FillImpl() override = default; + + void Launch(Stream* stream, void* dst, Scalar value, size_t count) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + LaunchFill(cuda_stream, reinterpret_cast(dst), GetValue(value), count); + } +}; + +template +std::unique_ptr NewFill() { + return std::unique_ptr(new FillImpl()); +} + +class FillFactoryImpl : public FillFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(FillFactoryImpl); + FillFactoryImpl() = default; + ~FillFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewFill}, + + 
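// LaunchFill above picks the widest vectorized store the destination pointer allows:
// a 16-byte pack when the address is 16-byte aligned, otherwise 8, 4, 2, or a plain
// scalar store; the pack width in elements is pack_bytes / sizeof(T), and the tail
// that does not fill a whole pack is written element by element. A host-side sketch
// of only that selection logic (hypothetical names); the original treats an alignment
// smaller than sizeof(T) as an error, here it just falls back to scalar stores.
#include <cstddef>
#include <cstdint>
#include <cstdio>

template<typename T>
size_t PackElemsFor(const void* dst) {
  const auto addr = reinterpret_cast<std::uintptr_t>(dst);
  size_t pack_bytes = 1;
  if (addr % 16 == 0) {
    pack_bytes = 16;
  } else if (addr % 8 == 0) {
    pack_bytes = 8;
  } else if (addr % 4 == 0) {
    pack_bytes = 4;
  } else if (addr % 2 == 0) {
    pack_bytes = 2;
  }
  return pack_bytes >= sizeof(T) ? pack_bytes / sizeof(T) : 1;
}

int main() {
  alignas(16) float buf[8];
  const size_t pack = PackElemsFor<float>(buf);  // 4 floats per 16-byte pack
  const size_t count = 7;
  std::printf("pack=%zu full_packs=%zu tail=%zu\n", pack, count / pack, count % pack);
  return 0;
}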
static const std::map()>> new_fill_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)}; + +#undef MAKE_NEW_FILL_ENTRY + + const auto it = new_fill_handle.find(data_type); + if (it != new_fill_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, FillFactory, FillFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/memcpy.cpp b/oneflow/core/ep/rocm/primitive/memcpy.cpp index 19624e4..f6b2600 100644 --- a/oneflow/core/ep/rocm/primitive/memcpy.cpp +++ b/oneflow/core/ep/rocm/primitive/memcpy.cpp @@ -1,62 +1,62 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM - -#include "oneflow/core/ep/include/primitive/memcpy.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -class MemcpyImpl : public Memcpy { - public: - OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl); - MemcpyImpl() = default; - ~MemcpyImpl() override = default; - - void Launch(Stream* stream, void* dst, const void* src, size_t count) override { - if (dst == src) { return; } - auto* cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream())); - } -}; - -class MemcpyFactoryImpl : public MemcpyFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl); - MemcpyFactoryImpl() = default; - ~MemcpyFactoryImpl() override = default; - - std::unique_ptr New(MemcpyKind kind) override { - return std::unique_ptr(new MemcpyImpl()); - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow - -#endif +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM + +#include "oneflow/core/ep/include/primitive/memcpy.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +class MemcpyImpl : public Memcpy { + public: + OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl); + MemcpyImpl() = default; + ~MemcpyImpl() override = default; + + void Launch(Stream* stream, void* dst, const void* src, size_t count) override { + if (dst == src) { return; } + auto* cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream())); + } +}; + +class MemcpyFactoryImpl : public MemcpyFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl); + MemcpyFactoryImpl() = default; + ~MemcpyFactoryImpl() override = default; + + std::unique_ptr New(MemcpyKind kind) override { + return std::unique_ptr(new MemcpyImpl()); + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow + +#endif diff --git a/oneflow/core/ep/rocm/primitive/memset.cpp b/oneflow/core/ep/rocm/primitive/memset.cpp index f92fc4d..9d912b6 100644 --- a/oneflow/core/ep/rocm/primitive/memset.cpp +++ b/oneflow/core/ep/rocm/primitive/memset.cpp @@ -1,59 +1,59 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM - -#include "oneflow/core/ep/include/primitive/memset.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -class MemsetImpl : public Memset { - public: - OF_DISALLOW_COPY_AND_MOVE(MemsetImpl); - MemsetImpl() = default; - ~MemsetImpl() override = default; - - void Launch(Stream* stream, void* ptr, int value, size_t count) override { - auto* cuda_stream = stream->As(); - OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream())); - } -}; - -class MemsetFactoryImpl : public MemsetFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl); - MemsetFactoryImpl() = default; - ~MemsetFactoryImpl() override = default; - - std::unique_ptr New() override { return std::unique_ptr(new MemsetImpl()); } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow - -#endif +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_ROCM + +#include "oneflow/core/ep/include/primitive/memset.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +class MemsetImpl : public Memset { + public: + OF_DISALLOW_COPY_AND_MOVE(MemsetImpl); + MemsetImpl() = default; + ~MemsetImpl() override = default; + + void Launch(Stream* stream, void* ptr, int value, size_t count) override { + auto* cuda_stream = stream->As(); + OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream())); + } +}; + +class MemsetFactoryImpl : public MemsetFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl); + MemsetFactoryImpl() = default; + ~MemsetFactoryImpl() override = default; + + std::unique_ptr New() override { return std::unique_ptr(new MemsetImpl()); } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow + +#endif diff --git a/oneflow/core/ep/rocm/primitive/permute.hip.cpp b/oneflow/core/ep/rocm/primitive/permute.hip.cpp index afc26a3..78ae275 100644 --- a/oneflow/core/ep/rocm/primitive/permute.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/permute.hip.cpp @@ -1,333 +1,333 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
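// The Memcpy and Memset primitives above are thin wrappers: Memcpy always passes
// hipMemcpyDefault so the runtime infers the copy direction from the pointers, and
// both only enqueue work on the primitive's HIP stream, leaving synchronization to
// the caller. A standalone sketch of the same calls against a raw HIP stream; it is
// illustrative usage, not OneFlow's wrapper, and error handling is reduced to abort().
#include <hip/hip_runtime.h>
#include <cstdlib>
#include <vector>

#define HIP_OK(expr)                                \
  do {                                              \
    if ((expr) != hipSuccess) { std::abort(); }     \
  } while (0)

int main() {
  hipStream_t stream = nullptr;
  HIP_OK(hipStreamCreate(&stream));

  const size_t bytes = 1024 * sizeof(float);
  float* dev = nullptr;
  HIP_OK(hipMalloc((void**)&dev, bytes));

  std::vector<float> host(1024, 1.0f);
  // Zero the device buffer, then copy the host data in; both calls are asynchronous.
  HIP_OK(hipMemsetAsync(dev, 0, bytes, stream));
  HIP_OK(hipMemcpyAsync(dev, host.data(), bytes, hipMemcpyDefault, stream));

  // The primitives return immediately; the caller owns synchronization.
  HIP_OK(hipStreamSynchronize(stream));
  HIP_OK(hipFree(dev));
  HIP_OK(hipStreamDestroy(stream));
  return 0;
}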
-*/ -#include "oneflow/core/ep/include/primitive/permute.h" -#include "oneflow/core/ep/common/primitive/permute_impl.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace permute { - -namespace internal { - -namespace { - -constexpr int32_t kMov4TileSize = 32; -constexpr int32_t kMov2TileSize = 64; -constexpr int32_t kBlockRows = 8; - -template -__global__ void PermuteKernel(PermuteKernelParams params) { - using T = typename std::aligned_storage::type; - const T* src = reinterpret_cast(params.src); - T* dst = reinterpret_cast(params.dst); - IndexType src_index[num_dims]; - IndexType dst_index[num_dims]; - CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { - params.dst_index_helper.OffsetToNdIndex(i, dst_index); -#pragma unroll - for (size_t dim = 0; dim < num_dims; ++dim) { - src_index[params.permutation[dim]] = dst_index[dim]; - } - IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); - dst[i] = src[src_offset]; - } -} - -// (B, X, Y) -> (B, Y, X) -// refer from https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ -template -__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType rows, - IndexType cols, IndexType num_tile_rows, - IndexType num_tile_cols, int32_t block_nums) { - const IndexType src_rows = rows; - const IndexType src_cols = cols; - const IndexType dst_rows = cols; - const IndexType dst_cols = rows; - - using T = typename std::aligned_storage::type; - __shared__ T tile[tile_size][tile_size + 1]; // To avoid bank conflict. - - const T* src = reinterpret_cast(src_ptr); - T* dst = reinterpret_cast(dst_ptr); - - IndexType batch_num_tile = num_tile_rows * num_tile_cols; - for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { - const IndexType batch_index = i / batch_num_tile; // the index of batch. - const IndexType tile_index = - i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the - // flatten index of tile in a batch. - - const IndexType tile_row_index = - tile_index / num_tile_cols; // the row index of tile in a batch. - const IndexType tile_col_index = - tile_index - - tile_row_index - * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. - - const IndexType offset = batch_index * src_rows * src_cols; - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; - if (col_in_matrix < src_cols && row_in_matrix < src_rows) { - tile[row_in_tile][col_in_tile] = src[offset + row_in_matrix * src_cols + col_in_matrix]; - } - } - } - __syncthreads(); - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; - if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { - dst[offset + row_in_matrix * dst_cols + col_in_matrix] = tile[col_in_tile][row_in_tile]; - } - } - } - __syncthreads(); - } -} - -/* -Here is a Movementsie=2 version of Batch Transpose. -When the H W can be divided by 2. we can read data use movementsize=4, and write back as -movementsize=4. 
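// The comment above describes the movement-size-2 fast path: when rows and cols are
// both even and the pointers are 4-byte aligned, two 16-bit (half) elements are moved
// as one 32-bit word, halving the number of memory transactions. A host-side sketch of
// that reinterpretation using uint16_t/uint32_t as stand-ins for half/half2; the GPU
// kernel applies the same idea through its shared-memory union.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  std::uint16_t src[4] = {1, 2, 3, 4};
  std::uint16_t dst[4] = {0, 0, 0, 0};
  // Move four 16-bit values as two 32-bit words instead of four 16-bit accesses.
  for (int i = 0; i < 2; ++i) {
    std::uint32_t word;
    std::memcpy(&word, src + 2 * i, sizeof(word));  // one 32-bit load covers two elements
    std::memcpy(dst + 2 * i, &word, sizeof(word));  // one 32-bit store covers two elements
  }
  for (int i = 0; i < 4; ++i) { std::printf("%d ", int(dst[i])); }  // prints: 1 2 3 4
  std::printf("\n");
  return 0;
}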
-*/ -template -__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows, - IndexType cols, IndexType num_tile_rows, - IndexType num_tile_cols, int32_t block_nums) { - const IndexType src_rows = rows; - const IndexType src_cols = cols; - const IndexType dst_rows = cols; - const IndexType dst_cols = rows; - - static_assert(tile_size % 2 == 0, ""); - using T_MOV2 = typename std::aligned_storage<2, 2>::type; - using T_MOV4 = typename std::aligned_storage<4, 4>::type; - - const T_MOV4* src = reinterpret_cast(src_ptr); - T_MOV4* dst = reinterpret_cast(dst_ptr); - - // Use union structure to process Load and Store. - __shared__ union { - T_MOV2 tile_m2[tile_size][tile_size + 2]; // half [64][66] - T_MOV4 tile_m4[tile_size][tile_size / 2 + 1]; // half2 [64][33] - } tile_mem; - - IndexType batch_num_tile = num_tile_rows * num_tile_cols; - for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { - const IndexType batch_index = i / batch_num_tile; // the index of batch. - const IndexType tile_index = - i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the - // flatten index of tile in a batch. - - const IndexType tile_row_index = - tile_index / num_tile_cols; // the row index of tile in a batch. - const IndexType tile_col_index = - tile_index - - tile_row_index - * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. - - const IndexType offset = batch_index * src_rows * src_cols; - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x * 2; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; - if (col_in_matrix < src_cols && row_in_matrix < src_rows) { - tile_mem.tile_m4[row_in_tile][col_in_tile] = - src[(offset + row_in_matrix * src_cols + col_in_matrix) / 2]; - } - } - } - __syncthreads(); - { - IndexType col_in_tile = threadIdx.x; - IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x * 2; -#pragma unroll - for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; - row_in_tile += kBlockRows) { - IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; - union { - T_MOV4 m4; - T_MOV2 m2[2]; - } tmp_storage; - - if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { - tmp_storage.m2[0] = tile_mem.tile_m2[col_in_tile * 2][row_in_tile]; - tmp_storage.m2[1] = tile_mem.tile_m2[col_in_tile * 2 + 1][row_in_tile]; - dst[(offset + row_in_matrix * dst_cols + col_in_matrix) / 2] = tmp_storage.m4; - } - } - } - __syncthreads(); - } -} - -template -void LaunchBatchTransposeKernel(hipStream_t& cuda_stream, - const PermuteKernelParams& params, - const IndexType& num_batches, const IndexType& rows, - const IndexType& cols) { - IndexType num_tile_rows = (rows + tile_size - 1) / tile_size; - IndexType num_tile_cols = (cols + tile_size - 1) / tile_size; - const int32_t block_nums = num_batches * num_tile_rows * num_tile_cols; - int32_t launched_block_nums = std::min(block_nums, kCudaMaxBlocksNum); - if (tile_size == kMov2TileSize) { - const int32_t half2_thread = tile_size / 2; // cause each thread process two half elements. - BatchTransposeMovement2Kernel - <<>>( - params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, - block_nums); // Set threads num as 32x8 cause each threads - // process 4 elements to 64x66 half share memory. 
- } else { - BatchTransposeKernel - <<>>( - params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, block_nums); - } -} - -template -bool CheckIfGreaterEqualThanTileSize(const IndexType& rows, const IndexType& cols) { - if (rows < tile_size || cols < tile_size) { return false; } - return true; -} - -template -bool CheckLaunchBatchTranspose(const int* permutation, const IndexType& num_batches, - const IndexType& rows, const IndexType& cols) { - if (CheckIfGreaterEqualThanTileSize(rows, cols)) { - if (num_batches == 1 && permutation[1] == 0 && permutation[0] == 1) { - // 2d tensor case: (0, 1) -> (1, 0) - return true; - } else if (num_dims == 3 && permutation[2] == 1 && permutation[1] == 2) { - // 3d tensor case: (0, 1, 2) -> (0, 2, 1) - return true; - } else { - return false; - } - } - return false; -} - -template -bool CheckUseMov2(const IndexType& rows, const IndexType& cols, const void* src, void* dst) { - auto src_ptr = reinterpret_cast(src); - auto dst_ptr = reinterpret_cast(dst); - return (movement_size == 2) && (rows % 2 == 0) && (cols % 2 == 0) && (src_ptr % 4 == 0) - && (dst_ptr % 4 == 0); -} - -template -void InferBatchTransposeShape(const int64_t* src_dims, IndexType* num_batches, IndexType* rows, - IndexType* cols) { - if (num_dims == 2) { - *num_batches = 1; - *rows = src_dims[0]; - *cols = src_dims[1]; - } else { - *num_batches = src_dims[0]; - *rows = src_dims[1]; - *cols = src_dims[2]; - } -} - -template -void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, const int* permutation, - void* dst, size_t count) { - PermuteKernelParams params = - MakePermuteParams(src_dims, src, permutation, dst, count); - hipStream_t cuda_stream = stream->As()->cuda_stream(); - - if (num_dims == 2 || num_dims == 3) { - IndexType num_batches; - IndexType rows; - IndexType cols; - InferBatchTransposeShape(src_dims, &num_batches, &rows, &cols); - if (CheckLaunchBatchTranspose(params.permutation, num_batches, rows, - cols)) { - if (CheckUseMov2(rows, cols, src, dst)) { - LaunchBatchTransposeKernel(cuda_stream, params, - num_batches, rows, cols); - } else { - LaunchBatchTransposeKernel( - cuda_stream, params, num_batches, rows, cols); - } - } else { - PermuteKernel - <<>>(params); - } - } else { - PermuteKernel - <<>>(params); - } -} - -class PermuteImpl : public Permute { - public: - OF_DISALLOW_COPY_AND_MOVE(PermuteImpl); - PermuteImpl() = default; - ~PermuteImpl() override = default; - - using Permute::Launch; - void Launch(Stream* stream, DataType data_type, size_t num_dims, const int64_t* src_dims, - const void* src, const int* permutation, void* dst) override { - SimplifyThenLaunch(stream, data_type, num_dims, src_dims, src, permutation, dst); - } -}; - -class PermuteFactoryImpl : public PermuteFactory { - public: - OF_DISALLOW_COPY_AND_MOVE(PermuteFactoryImpl); - PermuteFactoryImpl() = default; - ~PermuteFactoryImpl() override = default; - - std::unique_ptr New(size_t max_num_dims) override { - if (max_num_dims <= kMaxNumDims) { - return std::unique_ptr(new PermuteImpl()); - } else { - return nullptr; - } - } -}; - -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, PermuteFactory, PermuteFactoryImpl); - -} // namespace - -} // namespace internal - -} // namespace permute - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/ep/include/primitive/permute.h" +#include "oneflow/core/ep/common/primitive/permute_impl.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace permute { + +namespace internal { + +namespace { + +constexpr int32_t kMov4TileSize = 32; +constexpr int32_t kMov2TileSize = 64; +constexpr int32_t kBlockRows = 8; + +template +__global__ void PermuteKernel(PermuteKernelParams params) { + using T = typename std::aligned_storage::type; + const T* src = reinterpret_cast(params.src); + T* dst = reinterpret_cast(params.dst); + IndexType src_index[num_dims]; + IndexType dst_index[num_dims]; + CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) { + params.dst_index_helper.OffsetToNdIndex(i, dst_index); +#pragma unroll + for (size_t dim = 0; dim < num_dims; ++dim) { + src_index[params.permutation[dim]] = dst_index[dim]; + } + IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index); + dst[i] = src[src_offset]; + } +} + +// (B, X, Y) -> (B, Y, X) +// refer from https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ +template +__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType rows, + IndexType cols, IndexType num_tile_rows, + IndexType num_tile_cols, int32_t block_nums) { + const IndexType src_rows = rows; + const IndexType src_cols = cols; + const IndexType dst_rows = cols; + const IndexType dst_cols = rows; + + using T = typename std::aligned_storage::type; + __shared__ T tile[tile_size][tile_size + 1]; // To avoid bank conflict. + + const T* src = reinterpret_cast(src_ptr); + T* dst = reinterpret_cast(dst_ptr); + + IndexType batch_num_tile = num_tile_rows * num_tile_cols; + for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { + const IndexType batch_index = i / batch_num_tile; // the index of batch. + const IndexType tile_index = + i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the + // flatten index of tile in a batch. + + const IndexType tile_row_index = + tile_index / num_tile_cols; // the row index of tile in a batch. + const IndexType tile_col_index = + tile_index + - tile_row_index + * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. 
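// The index computations above decompose the flat grid-stride index i into
// (batch_index, tile_row_index, tile_col_index): divide by the number of tiles per
// batch, then split the remainder by num_tile_cols, using subtraction instead of a
// second % for each remainder. A tiny host-side check of that arithmetic; the names
// are local to this sketch.
#include <cassert>

static void DecomposeTileIndex(int i, int num_tile_rows, int num_tile_cols, int* batch,
                               int* tile_row, int* tile_col) {
  const int batch_num_tile = num_tile_rows * num_tile_cols;
  *batch = i / batch_num_tile;
  const int tile_index = i - *batch * batch_num_tile;  // i % batch_num_tile
  *tile_row = tile_index / num_tile_cols;
  *tile_col = tile_index - *tile_row * num_tile_cols;  // tile_index % num_tile_cols
}

int main() {
  int b = 0, r = 0, c = 0;
  DecomposeTileIndex(/*i=*/23, /*num_tile_rows=*/2, /*num_tile_cols=*/3, &b, &r, &c);
  assert(b == 3 && r == 1 && c == 2);  // 23 = 3 * (2*3) + 1 * 3 + 2
  return 0;
}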
+ + const IndexType offset = batch_index * src_rows * src_cols; + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; + if (col_in_matrix < src_cols && row_in_matrix < src_rows) { + tile[row_in_tile][col_in_tile] = src[offset + row_in_matrix * src_cols + col_in_matrix]; + } + } + } + __syncthreads(); + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; + if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { + dst[offset + row_in_matrix * dst_cols + col_in_matrix] = tile[col_in_tile][row_in_tile]; + } + } + } + __syncthreads(); + } +} + +/* +Here is a Movementsie=2 version of Batch Transpose. +When the H W can be divided by 2. we can read data use movementsize=4, and write back as +movementsize=4. +*/ +template +__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows, + IndexType cols, IndexType num_tile_rows, + IndexType num_tile_cols, int32_t block_nums) { + const IndexType src_rows = rows; + const IndexType src_cols = cols; + const IndexType dst_rows = cols; + const IndexType dst_cols = rows; + + static_assert(tile_size % 2 == 0, ""); + using T_MOV2 = typename std::aligned_storage<2, 2>::type; + using T_MOV4 = typename std::aligned_storage<4, 4>::type; + + const T_MOV4* src = reinterpret_cast(src_ptr); + T_MOV4* dst = reinterpret_cast(dst_ptr); + + // Use union structure to process Load and Store. + __shared__ union { + T_MOV2 tile_m2[tile_size][tile_size + 2]; // half [64][66] + T_MOV4 tile_m4[tile_size][tile_size / 2 + 1]; // half2 [64][33] + } tile_mem; + + IndexType batch_num_tile = num_tile_rows * num_tile_cols; + for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) { + const IndexType batch_index = i / batch_num_tile; // the index of batch. + const IndexType tile_index = + i - batch_index * batch_num_tile; // equal to i % (num_tile_rows*num_tile_cols). the + // flatten index of tile in a batch. + + const IndexType tile_row_index = + tile_index / num_tile_cols; // the row index of tile in a batch. + const IndexType tile_col_index = + tile_index + - tile_row_index + * num_tile_cols; // equal to k % num_tile_cols. the col index of tile in a batch. 
+ + const IndexType offset = batch_index * src_rows * src_cols; + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x * 2; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size; + if (col_in_matrix < src_cols && row_in_matrix < src_rows) { + tile_mem.tile_m4[row_in_tile][col_in_tile] = + src[(offset + row_in_matrix * src_cols + col_in_matrix) / 2]; + } + } + } + __syncthreads(); + { + IndexType col_in_tile = threadIdx.x; + IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x * 2; +#pragma unroll + for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size; + row_in_tile += kBlockRows) { + IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size; + union { + T_MOV4 m4; + T_MOV2 m2[2]; + } tmp_storage; + + if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) { + tmp_storage.m2[0] = tile_mem.tile_m2[col_in_tile * 2][row_in_tile]; + tmp_storage.m2[1] = tile_mem.tile_m2[col_in_tile * 2 + 1][row_in_tile]; + dst[(offset + row_in_matrix * dst_cols + col_in_matrix) / 2] = tmp_storage.m4; + } + } + } + __syncthreads(); + } +} + +template +void LaunchBatchTransposeKernel(hipStream_t& cuda_stream, + const PermuteKernelParams& params, + const IndexType& num_batches, const IndexType& rows, + const IndexType& cols) { + IndexType num_tile_rows = (rows + tile_size - 1) / tile_size; + IndexType num_tile_cols = (cols + tile_size - 1) / tile_size; + const int32_t block_nums = num_batches * num_tile_rows * num_tile_cols; + int32_t launched_block_nums = std::min(block_nums, kCudaMaxBlocksNum); + if (tile_size == kMov2TileSize) { + const int32_t half2_thread = tile_size / 2; // cause each thread process two half elements. + BatchTransposeMovement2Kernel + <<>>( + params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, + block_nums); // Set threads num as 32x8 cause each threads + // process 4 elements to 64x66 half share memory. 
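// LaunchBatchTransposeKernel above sizes its grid as
//   num_tile_rows = ceil(rows / tile_size), num_tile_cols = ceil(cols / tile_size),
//   block_nums    = num_batches * num_tile_rows * num_tile_cols,
// and launches min(block_nums, kCudaMaxBlocksNum) blocks; the grid-stride loop in the
// kernels (i += gridDim.x) then covers any tiles beyond the launched blocks. A
// host-side sketch of just that arithmetic; kMaxBlocks is a stand-in constant, not
// the real kCudaMaxBlocksNum value.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t tile_size = 32, kMaxBlocks = 8192;
  const int64_t num_batches = 16, rows = 1000, cols = 1000;
  const int64_t num_tile_rows = (rows + tile_size - 1) / tile_size;        // 32
  const int64_t num_tile_cols = (cols + tile_size - 1) / tile_size;        // 32
  const int64_t block_nums = num_batches * num_tile_rows * num_tile_cols;  // 16384
  const int64_t launched = std::min(block_nums, kMaxBlocks);               // clamped to 8192
  std::printf("tiles=%lld launched=%lld tiles_per_block>=%lld\n", (long long)block_nums,
              (long long)launched, (long long)(block_nums / launched));
  return 0;
}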
+ } else { + BatchTransposeKernel + <<>>( + params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, block_nums); + } +} + +template +bool CheckIfGreaterEqualThanTileSize(const IndexType& rows, const IndexType& cols) { + if (rows < tile_size || cols < tile_size) { return false; } + return true; +} + +template +bool CheckLaunchBatchTranspose(const int* permutation, const IndexType& num_batches, + const IndexType& rows, const IndexType& cols) { + if (CheckIfGreaterEqualThanTileSize(rows, cols)) { + if (num_batches == 1 && permutation[1] == 0 && permutation[0] == 1) { + // 2d tensor case: (0, 1) -> (1, 0) + return true; + } else if (num_dims == 3 && permutation[2] == 1 && permutation[1] == 2) { + // 3d tensor case: (0, 1, 2) -> (0, 2, 1) + return true; + } else { + return false; + } + } + return false; +} + +template +bool CheckUseMov2(const IndexType& rows, const IndexType& cols, const void* src, void* dst) { + auto src_ptr = reinterpret_cast(src); + auto dst_ptr = reinterpret_cast(dst); + return (movement_size == 2) && (rows % 2 == 0) && (cols % 2 == 0) && (src_ptr % 4 == 0) + && (dst_ptr % 4 == 0); +} + +template +void InferBatchTransposeShape(const int64_t* src_dims, IndexType* num_batches, IndexType* rows, + IndexType* cols) { + if (num_dims == 2) { + *num_batches = 1; + *rows = src_dims[0]; + *cols = src_dims[1]; + } else { + *num_batches = src_dims[0]; + *rows = src_dims[1]; + *cols = src_dims[2]; + } +} + +template +void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, const int* permutation, + void* dst, size_t count) { + PermuteKernelParams params = + MakePermuteParams(src_dims, src, permutation, dst, count); + hipStream_t cuda_stream = stream->As()->cuda_stream(); + + if (num_dims == 2 || num_dims == 3) { + IndexType num_batches; + IndexType rows; + IndexType cols; + InferBatchTransposeShape(src_dims, &num_batches, &rows, &cols); + if (CheckLaunchBatchTranspose(params.permutation, num_batches, rows, + cols)) { + if (CheckUseMov2(rows, cols, src, dst)) { + LaunchBatchTransposeKernel(cuda_stream, params, + num_batches, rows, cols); + } else { + LaunchBatchTransposeKernel( + cuda_stream, params, num_batches, rows, cols); + } + } else { + PermuteKernel + <<>>(params); + } + } else { + PermuteKernel + <<>>(params); + } +} + +class PermuteImpl : public Permute { + public: + OF_DISALLOW_COPY_AND_MOVE(PermuteImpl); + PermuteImpl() = default; + ~PermuteImpl() override = default; + + using Permute::Launch; + void Launch(Stream* stream, DataType data_type, size_t num_dims, const int64_t* src_dims, + const void* src, const int* permutation, void* dst) override { + SimplifyThenLaunch(stream, data_type, num_dims, src_dims, src, permutation, dst); + } +}; + +class PermuteFactoryImpl : public PermuteFactory { + public: + OF_DISALLOW_COPY_AND_MOVE(PermuteFactoryImpl); + PermuteFactoryImpl() = default; + ~PermuteFactoryImpl() override = default; + + std::unique_ptr New(size_t max_num_dims) override { + if (max_num_dims <= kMaxNumDims) { + return std::unique_ptr(new PermuteImpl()); + } else { + return nullptr; + } + } +}; + +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, PermuteFactory, PermuteFactoryImpl); + +} // namespace + +} // namespace internal + +} // namespace permute + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/softmax.hip.cpp b/oneflow/core/ep/rocm/primitive/softmax.hip.cpp index c1d2d00..85d065c 100644 --- a/oneflow/core/ep/rocm/primitive/softmax.hip.cpp +++ 
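// The batch-transpose fast path above only triggers for permutation (1, 0) on a 2-d
// tensor or (0, 2, 1) on a 3-d tensor, i.e. (B, rows, cols) -> (B, cols, rows), and
// only when both dimensions reach the tile size. A plain CPU reference of that
// permutation, handy as a correctness oracle for the tiled kernels; this sketch is
// not part of the primitive.
#include <cassert>
#include <cstdint>
#include <vector>

template<typename T>
std::vector<T> BatchTransposeCpu(const std::vector<T>& src, int64_t num_batches, int64_t rows,
                                 int64_t cols) {
  std::vector<T> dst(src.size());
  for (int64_t b = 0; b < num_batches; ++b) {
    for (int64_t r = 0; r < rows; ++r) {
      for (int64_t c = 0; c < cols; ++c) {
        dst[(b * cols + c) * rows + r] = src[(b * rows + r) * cols + c];
      }
    }
  }
  return dst;
}

int main() {
  // One batch, 2x3 -> 3x2.
  const std::vector<int> src = {1, 2, 3, 4, 5, 6};
  const std::vector<int> expect = {1, 4, 2, 5, 3, 6};
  assert(BatchTransposeCpu(src, 1, 2, 3) == expect);
  return 0;
}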
b/oneflow/core/ep/rocm/primitive/softmax.hip.cpp @@ -1,107 +1,107 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/softmax.h" -#include "oneflow/core/ep/include/primitive/log_softmax.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -enum class Algorithm { - kSoftmax, - kLogSoftmax, -}; - -template -void SoftmaxGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) { - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - oneflow::cuda::softmax::DirectLoad load(x, cols); - oneflow::cuda::softmax::DirectStore store(y, cols); - if (algorithm == Algorithm::kSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - cuda_stream, load, store, rows, cols))); - } else if (algorithm == Algorithm::kLogSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( - cuda_stream, load, store, rows, cols))); - } else { - UNIMPLEMENTED(); - } -} - -template -class SoftmaxImpl : public SoftmaxBase { - public: - OF_DISALLOW_COPY_AND_MOVE(SoftmaxImpl); - SoftmaxImpl() = default; - ~SoftmaxImpl() override = default; - - void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - SoftmaxGpu(cuda_stream, rows, cols, reinterpret_cast(x), - reinterpret_cast(y)); - } -}; - -template -std::unique_ptr NewSoftmax() { - return std::unique_ptr(new SoftmaxImpl()); -} - -template -class GenericSoftmaxFactoryImpl : public FactoryBase { - public: - OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxFactoryImpl); - GenericSoftmaxFactoryImpl() = default; - ~GenericSoftmaxFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ - {type_proto, NewSoftmax}, - - static const std::map()>> - new_softmax_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; - -#undef MAKE_NEW_SOFTMAX_ENTRY - - const auto it = new_softmax_handle.find(data_type); - if (it != new_softmax_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -using SoftmaxFactoryImpl = GenericSoftmaxFactoryImpl; -using LogSoftmaxFactoryImpl = - GenericSoftmaxFactoryImpl; -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxFactory, SoftmaxFactoryImpl); -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxFactory, LogSoftmaxFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/softmax.h" +#include "oneflow/core/ep/include/primitive/log_softmax.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +enum class Algorithm { + kSoftmax, + kLogSoftmax, +}; + +template +void SoftmaxGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) { + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + oneflow::cuda::softmax::DirectLoad load(x, cols); + oneflow::cuda::softmax::DirectStore store(y, cols); + if (algorithm == Algorithm::kSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + cuda_stream, load, store, rows, cols))); + } else if (algorithm == Algorithm::kLogSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( + cuda_stream, load, store, rows, cols))); + } else { + UNIMPLEMENTED(); + } +} + +template +class SoftmaxImpl : public SoftmaxBase { + public: + OF_DISALLOW_COPY_AND_MOVE(SoftmaxImpl); + SoftmaxImpl() = default; + ~SoftmaxImpl() override = default; + + void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + SoftmaxGpu(cuda_stream, rows, cols, reinterpret_cast(x), + reinterpret_cast(y)); + } +}; + +template +std::unique_ptr NewSoftmax() { + return std::unique_ptr(new SoftmaxImpl()); +} + +template +class GenericSoftmaxFactoryImpl : public FactoryBase { + public: + OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxFactoryImpl); + GenericSoftmaxFactoryImpl() = default; + ~GenericSoftmaxFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ + {type_proto, NewSoftmax}, + + static const std::map()>> + new_softmax_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; + +#undef MAKE_NEW_SOFTMAX_ENTRY + + const auto it = new_softmax_handle.find(data_type); + if (it != new_softmax_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +using SoftmaxFactoryImpl = GenericSoftmaxFactoryImpl; +using LogSoftmaxFactoryImpl = + GenericSoftmaxFactoryImpl; +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxFactory, SoftmaxFactoryImpl); +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxFactory, LogSoftmaxFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git a/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp b/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp index d184890..12088b8 100644 --- a/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp +++ b/oneflow/core/ep/rocm/primitive/softmax_backward.hip.cpp @@ -1,116 +1,116 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
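// The softmax primitive above treats its input as a (rows, cols) matrix, with every
// leading dimension flattened into rows and the reduction taken over the trailing
// cols; DispatchSoftmax then selects an implementation by column count. A numerically
// stable CPU reference for a single row (subtract the row max before exponentiating);
// this sketch is independent of the dispatch machinery.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

std::vector<double> SoftmaxRow(const std::vector<double>& x) {
  double max_x = x[0];
  for (double v : x) { max_x = std::max(max_x, v); }
  std::vector<double> y(x.size());
  double sum = 0.0;
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::exp(x[i] - max_x);  // shifting by the max keeps exp from overflowing
    sum += y[i];
  }
  for (double& v : y) { v /= sum; }
  return y;
}

int main() {
  const std::vector<double> y = SoftmaxRow({1.0, 2.0, 3.0});
  double total = 0.0;
  for (double v : y) { total += v; }
  assert(std::abs(total - 1.0) < 1e-12);  // a softmax row sums to 1
  assert(y[2] > y[1] && y[1] > y[0]);     // ordering is preserved
  return 0;
}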
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/ep/include/primitive/softmax_backward.h" -#include "oneflow/core/ep/include/primitive/log_softmax_backward.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace ep { -namespace primitive { - -namespace { - -enum class Algorithm { - kSoftmax, - kLogSoftmax, -}; - -template -void SoftmaxBackwardGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy, - T* dx) { - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load_y(y, cols); - cuda::softmax::DirectLoad load_dy(dy, cols); - cuda::softmax::DirectStore store(dx, cols); - if (algorithm == Algorithm::kSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - cuda_stream, load_y, load_dy, store, rows, cols))); - } else if (algorithm == Algorithm::kLogSoftmax) { - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmaxGrad( - cuda_stream, load_y, load_dy, store, rows, cols))); - } else { - UNIMPLEMENTED(); - } -} - -template -class SoftmaxBackwardImpl : public SoftmaxBackwardBase { - public: - OF_DISALLOW_COPY_AND_MOVE(SoftmaxBackwardImpl); - SoftmaxBackwardImpl() = default; - ~SoftmaxBackwardImpl() override = default; - - void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy, - void* dx) override { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - SoftmaxBackwardGpu(cuda_stream, rows, cols, reinterpret_cast(y), - reinterpret_cast(dy), reinterpret_cast(dx)); - } -}; - -template -std::unique_ptr NewSoftmaxBackward() { - return std::unique_ptr( - new SoftmaxBackwardImpl()); -} - -template -class GenericSoftmaxBackwardFactoryImpl : public BackwardFactoryBase { - public: - OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxBackwardFactoryImpl); - GenericSoftmaxBackwardFactoryImpl() = default; - ~GenericSoftmaxBackwardFactoryImpl() override = default; - - std::unique_ptr New(DataType data_type) override { -#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ - {type_proto, NewSoftmaxBackward}, - - static const std::map()>> - new_softmax_backward_handle{ - OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; - -#undef MAKE_NEW_SOFTMAX_ENTRY - - const auto it = new_softmax_backward_handle.find(data_type); - if (it != new_softmax_backward_handle.end()) { - return it->second(); - } else { - return nullptr; - } - } -}; - -using SoftmaxBackwardFactoryImpl = - GenericSoftmaxBackwardFactoryImpl; -using LogSoftmaxBackwardFactoryImpl = - GenericSoftmaxBackwardFactoryImpl; -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxBackwardFactory, SoftmaxBackwardFactoryImpl); -REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxBackwardFactory, - LogSoftmaxBackwardFactoryImpl); - -} // namespace - -} // namespace primitive -} // namespace ep - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/ep/include/primitive/softmax_backward.h" +#include "oneflow/core/ep/include/primitive/log_softmax_backward.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace ep { +namespace primitive { + +namespace { + +enum class Algorithm { + kSoftmax, + kLogSoftmax, +}; + +template +void SoftmaxBackwardGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy, + T* dx) { + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + cuda::softmax::DirectStore store(dx, cols); + if (algorithm == Algorithm::kSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + cuda_stream, load_y, load_dy, store, rows, cols))); + } else if (algorithm == Algorithm::kLogSoftmax) { + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmaxGrad( + cuda_stream, load_y, load_dy, store, rows, cols))); + } else { + UNIMPLEMENTED(); + } +} + +template +class SoftmaxBackwardImpl : public SoftmaxBackwardBase { + public: + OF_DISALLOW_COPY_AND_MOVE(SoftmaxBackwardImpl); + SoftmaxBackwardImpl() = default; + ~SoftmaxBackwardImpl() override = default; + + void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy, + void* dx) override { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + SoftmaxBackwardGpu(cuda_stream, rows, cols, reinterpret_cast(y), + reinterpret_cast(dy), reinterpret_cast(dx)); + } +}; + +template +std::unique_ptr NewSoftmaxBackward() { + return std::unique_ptr( + new SoftmaxBackwardImpl()); +} + +template +class GenericSoftmaxBackwardFactoryImpl : public BackwardFactoryBase { + public: + OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxBackwardFactoryImpl); + GenericSoftmaxBackwardFactoryImpl() = default; + ~GenericSoftmaxBackwardFactoryImpl() override = default; + + std::unique_ptr New(DataType data_type) override { +#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \ + {type_proto, NewSoftmaxBackward}, + + static const std::map()>> + new_softmax_backward_handle{ + OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)}; + +#undef MAKE_NEW_SOFTMAX_ENTRY + + const auto it = new_softmax_backward_handle.find(data_type); + if (it != new_softmax_backward_handle.end()) { + return it->second(); + } else { + return nullptr; + } + } +}; + +using SoftmaxBackwardFactoryImpl = + GenericSoftmaxBackwardFactoryImpl; +using LogSoftmaxBackwardFactoryImpl = + GenericSoftmaxBackwardFactoryImpl; +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxBackwardFactory, SoftmaxBackwardFactoryImpl); +REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxBackwardFactory, + LogSoftmaxBackwardFactoryImpl); + +} // namespace + +} // namespace primitive +} // namespace ep + +} // namespace oneflow diff --git 
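// SoftmaxBackwardGpu above implements the standard gradients:
//   softmax:     dx_j = y_j * (dy_j - sum_k dy_k * y_k)
//   log-softmax: dx_j = dy_j - exp(y_j) * sum_k dy_k   (y being the log-softmax output)
// A CPU reference of both formulas, usable as an oracle for the fused kernels; this is
// a sketch in double precision over a single row, not OneFlow's implementation.
#include <cassert>
#include <cmath>
#include <vector>

std::vector<double> SoftmaxGradRow(const std::vector<double>& y, const std::vector<double>& dy) {
  double dot = 0.0;
  for (size_t i = 0; i < y.size(); ++i) { dot += dy[i] * y[i]; }
  std::vector<double> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i) { dx[i] = y[i] * (dy[i] - dot); }
  return dx;
}

std::vector<double> LogSoftmaxGradRow(const std::vector<double>& y,
                                      const std::vector<double>& dy) {
  double sum_dy = 0.0;
  for (double v : dy) { sum_dy += v; }
  std::vector<double> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i) { dx[i] = dy[i] - std::exp(y[i]) * sum_dy; }
  return dx;
}

int main() {
  // For both variants, the per-row gradient sums to zero when y is a valid (log-)softmax output.
  const std::vector<double> y = {0.2, 0.3, 0.5};
  const std::vector<double> dy = {1.0, -2.0, 0.5};
  double total = 0.0;
  for (double v : SoftmaxGradRow(y, dy)) { total += v; }
  assert(std::abs(total) < 1e-12);

  const std::vector<double> log_y = {std::log(0.2), std::log(0.3), std::log(0.5)};
  total = 0.0;
  for (double v : LogSoftmaxGradRow(log_y, dy)) { total += v; }
  assert(std::abs(total) < 1e-12);
  return 0;
}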
a/oneflow/core/ep/rocm/primitive/type_seq.h b/oneflow/core/ep/rocm/primitive/type_seq.h index d82aa05..fff3643 100644 --- a/oneflow/core/ep/rocm/primitive/type_seq.h +++ b/oneflow/core/ep/rocm/primitive/type_seq.h @@ -1,78 +1,78 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ -#define ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ - -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/common/data_type.h" - -#ifdef WITH_ROCM -#include -#include - -// #if CUDA_VERSION >= 11000 -// #include -// #endif // CUDA_VERSION >= 11000 - -#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool) -#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar) -#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) -#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) -#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) -#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) -#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) -#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) -#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) -#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble) -#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16) - -// #if CUDA_VERSION >= 11000 -// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16) -// #else -#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ -// #endif // CUDA_VERSION >= 11000 - -#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \ - CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ - CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ - CUDA_PRIMITIVE_INT8_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ - CUDA_PRIMITIVE_INT32_TYPE_SEQ \ - CUDA_PRIMITIVE_INT64_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -#define UTIL_OPS_DATA_TYPE_SEQ \ - CUDA_PRIMITIVE_INT8_TYPE_SEQ \ - CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ - CUDA_PRIMITIVE_INT32_TYPE_SEQ \ - CUDA_PRIMITIVE_INT64_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ - CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ - CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ - CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ - -#endif // WITH_ROCM - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ +#define ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ + +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/common/data_type.h" + +#ifdef WITH_ROCM +#include +#include + +// #if CUDA_VERSION >= 11000 +// #include +// #endif // CUDA_VERSION >= 11000 + +#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool) +#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar) +#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) +#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) +#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) +#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) +#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) +#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) +#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) +#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble) +#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16) + +// #if CUDA_VERSION >= 11000 +// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16) +// #else +#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ +// #endif // CUDA_VERSION >= 11000 + +#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \ + CUDA_PRIMITIVE_BOOL_TYPE_SEQ \ + CUDA_PRIMITIVE_CHAR_TYPE_SEQ \ + CUDA_PRIMITIVE_INT8_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ + CUDA_PRIMITIVE_INT32_TYPE_SEQ \ + CUDA_PRIMITIVE_INT64_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +#define UTIL_OPS_DATA_TYPE_SEQ \ + CUDA_PRIMITIVE_INT8_TYPE_SEQ \ + CUDA_PRIMITIVE_UINT8_TYPE_SEQ \ + CUDA_PRIMITIVE_INT32_TYPE_SEQ \ + CUDA_PRIMITIVE_INT64_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \ + CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \ + CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \ + CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ + +#endif // WITH_ROCM + #endif // ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_ \ No newline at end of file diff --git a/oneflow/core/ep/rocm/primitive/unary_functor.hip.h b/oneflow/core/ep/rocm/primitive/unary_functor.hip.h index f3ff395..2dcec8d 100644 --- a/oneflow/core/ep/rocm/primitive/unary_functor.hip.h +++ b/oneflow/core/ep/rocm/primitive/unary_functor.hip.h @@ -1,170 +1,170 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
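For reference, with the ROCm type sequences defined above (bfloat16 is compiled out because CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ is defined empty), the factory map in softmax_backward.hip.cpp expands to just three entries. The template arguments below are reconstructed from the CUDA version of that file and should be treated as approximate, not as the literal preprocessor output:

// Hand-expanded equivalent of the OF_PP_FOR_EACH_TUPLE map initializer under ROCm.
// Names and template-argument order follow the CUDA softmax_backward primitive and
// are a reconstruction, not part of this patch.
static const std::map<DataType, std::function<std::unique_ptr<SoftmaxBackwardBase>()>>
    new_softmax_backward_handle{
        {DataType::kFloat, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, float>},
        {DataType::kDouble, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, double>},
        {DataType::kFloat16, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, half>},
    };
// Any other DataType (including kBFloat16 here) misses the map and New() returns nullptr.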
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/ep/common/primitive/unary_functor.h" -#include "oneflow/core/ep/rocm/primitive/type_seq.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace ep { -namespace primitive { - -template -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC Dst operator()(Src src) const { - return static_cast(0.5) * src - * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * src)); - } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(float src) const { return isinf(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(double src) const { return isinf(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(half src) const { return isnan(__half2float(src)); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(float src) const { return isnan(src); } -}; - -template<> -struct UnaryFunctor { - UnaryFunctor(Scalar attr0, Scalar attr1) {} - - OF_DEVICE_FUNC bool operator()(double src) const { return isnan(src); } -}; - -#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \ - template<> \ - struct UnaryFunctor { \ - UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ - \ - UnaryFunctor float_functor; \ - OF_DEVICE_FUNC half operator()(half src) const { \ - return __float2half(float_functor(__half2float(src))); \ - } \ - }; - -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCelu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kGelu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kMish); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign); -SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus); - -// /*********nv_bfloat16_kernel*******/ - -// #if CUDA_VERSION >= 11000 - -// #define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \ -// template<> \ -// struct UnaryFunctor { \ -// UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ -// \ -// UnaryFunctor float_functor; \ -// 
OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \ -// return __float2bfloat16(float_functor(__bfloat162float(src))); \ -// } \ -// }; - -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kGelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSwish); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSigmoid); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardShrink); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardTanh); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLeakyRelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kMish); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSelu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSilu); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftShrink); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh); -// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold); - -// template<> -// struct UnaryFunctor { -// UnaryFunctor(Scalar attr0, Scalar attr1) {} - -// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isinf(__bfloat162float(src)); } -// }; - -// template<> -// struct UnaryFunctor { -// UnaryFunctor(Scalar attr0, Scalar attr1) {} - -// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); } -// }; - -// #endif - -} // namespace primitive -} // namespace ep -} // namespace oneflow - - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/ep/common/primitive/unary_functor.h" +#include "oneflow/core/ep/rocm/primitive/type_seq.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace ep { +namespace primitive { + +template +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC Dst operator()(Src src) const { + return static_cast(0.5) * src + * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * src)); + } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(float src) const { return isinf(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(double src) const { return isinf(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(half src) const { return isnan(__half2float(src)); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(float src) const { return isnan(src); } +}; + +template<> +struct UnaryFunctor { + UnaryFunctor(Scalar attr0, Scalar attr1) {} + + OF_DEVICE_FUNC bool operator()(double src) const { return isnan(src); } +}; + +#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \ + template<> \ + struct UnaryFunctor { \ + UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ + \ + UnaryFunctor float_functor; \ + OF_DEVICE_FUNC half operator()(half src) const { \ + return __float2half(float_functor(__half2float(src))); \ + } \ + }; + +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCelu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kGelu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kMish); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign); +SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus); + +// /*********nv_bfloat16_kernel*******/ + +// #if CUDA_VERSION >= 11000 + +// #define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \ +// template<> \ +// struct UnaryFunctor { \ +// UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \ +// \ +// UnaryFunctor float_functor; \ +// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \ +// return __float2bfloat16(float_functor(__bfloat162float(src))); \ +// } \ +// }; + +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCelu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kGelu); +// 
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSwish); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSigmoid); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardShrink); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardTanh); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLeakyRelu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kMish); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSelu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSilu); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftShrink); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh); +// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold); + +// template<> +// struct UnaryFunctor { +// UnaryFunctor(Scalar attr0, Scalar attr1) {} + +// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isinf(__bfloat162float(src)); } +// }; + +// template<> +// struct UnaryFunctor { +// UnaryFunctor(Scalar attr0, Scalar attr1) {} + +// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); } +// }; + +// #endif + +} // namespace primitive +} // namespace ep +} // namespace oneflow + + diff --git a/oneflow/core/framework/random_generator_impl.hip.cpp b/oneflow/core/framework/random_generator_impl.hip.cpp index d0285cc..7fdbc19 100644 --- a/oneflow/core/framework/random_generator_impl.hip.cpp +++ b/oneflow/core/framework/random_generator_impl.hip.cpp @@ -1,46 +1,46 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/random_generator_impl.h" - -namespace oneflow { -namespace one { - -namespace { - -__global__ void InitCurandStatesKernel(uint64_t seed, hiprandState* states, - CUDAGeneratorState* cuda_gen_state) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) - + (static_cast(id) >> 2U)); - hiprand_init(local_seed, 0, 0, &states[id]); - cuda_gen_state->dev_counter = static_cast(0); - cuda_gen_state->dev_offset = static_cast(0); -} - -} // namespace - -namespace detail { - -void InitCurandStates(uint64_t seed, int32_t block_num, int32_t thread_num, hiprandState* states, - CUDAGeneratorState* cuda_gen_state) { - hipLaunchKernelGGL(InitCurandStatesKernel, block_num, thread_num, 0, 0, seed, states, cuda_gen_state); -} - -} // namespace detail - -} // namespace one +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
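The SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR pattern above evaluates each half op by round-tripping through the corresponding float functor. For kGelu, for example, the generated specialization effectively computes the following (a sketch only; the real code is produced by the macro, and the float functor is the generic Gelu functor defined at the top of that header):

// Sketch of what the macro-generated half Gelu functor boils down to.
// HalfGeluSketch is an illustrative name, not part of the patch.
struct HalfGeluSketch {
  __device__ half operator()(half src) const {
    const float x = __half2float(src);                                    // widen to float
    const float y = 0.5f * x * (1.0f + erff(float(M_SQRT1_2) * x));       // float Gelu
    return __float2half(y);                                               // narrow back to half
  }
};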
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/random_generator_impl.h" + +namespace oneflow { +namespace one { + +namespace { + +__global__ void InitCurandStatesKernel(uint64_t seed, hiprandState* states, + CUDAGeneratorState* cuda_gen_state) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) + + (static_cast(id) >> 2U)); + hiprand_init(local_seed, 0, 0, &states[id]); + cuda_gen_state->dev_counter = static_cast(0); + cuda_gen_state->dev_offset = static_cast(0); +} + +} // namespace + +namespace detail { + +void InitCurandStates(uint64_t seed, int32_t block_num, int32_t thread_num, hiprandState* states, + CUDAGeneratorState* cuda_gen_state) { + hipLaunchKernelGGL(InitCurandStatesKernel, block_num, thread_num, 0, 0, seed, states, cuda_gen_state); +} + +} // namespace detail + +} // namespace one } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/hip/atomic.hip.h b/oneflow/core/hip/atomic.hip.h index cf1e001..c4daf4d 100644 --- a/oneflow/core/hip/atomic.hip.h +++ b/oneflow/core/hip/atomic.hip.h @@ -1,214 +1,214 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_HIP_ATOMIC_H_ -#define ONEFLOW_CORE_HIP_ATOMIC_H_ - -#if defined(__HIPCC__) - -#include -#include -#include - -namespace oneflow { - -namespace cuda { - -namespace atomic { - -namespace internal { - -template -__device__ __forceinline__ T CastCASImpl(T* address, T compare, T val) { - static_assert(sizeof(T) == sizeof(U), ""); - U ret = atomicCAS(reinterpret_cast(address), *(reinterpret_cast(&compare)), - *(reinterpret_cast(&val))); - return *(reinterpret_cast(&ret)); -} - -template -__device__ __forceinline__ typename std::enable_if::type -CASImpl(T* address, T compare, T val) { - return CastCASImpl(address, compare, val); -} - -template -__device__ __forceinline__ - typename std::enable_if::type - CASImpl(T* address, T compare, T val) { - return CastCASImpl(address, compare, val); -} - -__device__ __forceinline__ int CASImpl(int* address, int compare, int val) { - return atomicCAS(address, compare, val); -} - -__device__ __forceinline__ unsigned int CASImpl(unsigned int* address, unsigned int compare, - unsigned int val) { - return atomicCAS(address, compare, val); -} - -__device__ __forceinline__ unsigned long long int CASImpl(unsigned long long int* address, - unsigned long long int compare, - unsigned long long int val) { - return atomicCAS(address, compare, val); -} - -// #if __CUDA_ARCH__ >= 700 - -// __device__ __forceinline__ unsigned short int CASImpl(unsigned short int* address, -// unsigned short int compare, -// unsigned short int val) { -// return atomicCAS(address, compare, val); -// } - -// #endif // __CUDA_ARCH__ >= 700 - -template -struct AddOp { - __device__ __forceinline__ T operator()(T a, T b) { return a + b; } -}; - -template class BinaryOp> -__device__ __forceinline__ T AtomicCASBinaryImpl(T* address, T val) { - T old = *address; - T assumed; - do { - assumed = old; - old = CASImpl(address, assumed, BinaryOp()(old, val)); - } while (old != assumed); - return old; -} - -template -__device__ __forceinline__ T AddImpl(T* address, T val) { - return AtomicCASBinaryImpl(address, val); -} - -__device__ __forceinline__ int AddImpl(int* address, int val) { return atomicAdd(address, val); } - -__device__ __forceinline__ unsigned int AddImpl(unsigned int* address, unsigned int val) { - return atomicAdd(address, val); -} - -__device__ __forceinline__ unsigned long long int AddImpl(unsigned long long int* address, - unsigned long long int val) { - return atomicAdd(address, val); -} - -__device__ __forceinline__ uint64_t AddImpl(uint64_t* address, uint64_t val) { - static_assert(sizeof(uint64_t) == sizeof(unsigned long long int), ""); - return static_cast(atomicAdd(reinterpret_cast(address), - static_cast(val))); -} - -__device__ __forceinline__ float AddImpl(float* address, float val) { - return atomicAdd(address, val); -} - -// #if __CUDA_ARCH__ >= 600 - -__device__ __forceinline__ double AddImpl(double* address, double val) { - return atomicAdd(address, val); -} - -// __device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { -// return atomicAdd(address, val); -// } - -// #endif // __CUDA_ARCH__ >= 600 - -// #if __CUDA_ARCH__ >= 700 - -__device__ __forceinline__ half AddImpl(half* address, half val) { - float address_value = __half2float(*address); - return __float2half(atomicAdd(&address_value, __half2float(val))); } - -// #endif // __CUDA_ARCH__ >= 700 - -// #if __CUDA_ARCH__ >= 800 - -// __device__ __forceinline__ nv_bfloat16 AddImpl(nv_bfloat16* address, nv_bfloat16 val) { -// return atomicAdd(address, val); -// } - -// #endif // 
__CUDA_ARCH__ >= 800 - -// #if __CUDA_ARCH__ < 530 - -__device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { - half2 res; - float2 address_value = __half22float2(*address); - res.data.x = __float2half(atomicAdd(&address_value.x, __half2float(val.data.x))); - res.data.y = __float2half(atomicAdd(&address_value.y, __half2float(val.data.y))); - return res; -} - -// #endif // __CUDA_ARCH__ < 530 - -} // namespace internal - -template -__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { - return static_cast(v); -} - -template -__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { - return v; -} - -template -__device__ __forceinline__ T CAS(T* address, U compare, V val) { - return internal::CASImpl(address, Cast(compare), Cast(val)); -} - -template -__device__ __forceinline__ T Add(T* address, U val) { - return internal::AddImpl(address, Cast(val)); -} - -__device__ __forceinline__ float Max(float* address, const float val) { - int* address_as_i = (int*)address; - int old = *address_as_i; - int assumed = 0; - do { - assumed = old; - old = atomicCAS(address_as_i, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); - } while (assumed != old); - return __int_as_float(old); -} - -__device__ __forceinline__ double Max(double* address, const double val) { - unsigned long long int* address_as_i = (unsigned long long int*)address; - unsigned long long int old = *address_as_i; - unsigned long long int assumed = 0; - do { - assumed = old; - old = atomicCAS(address_as_i, assumed, - __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); - } while (assumed != old); - return __longlong_as_double(old); -} - -} // namespace atomic - -} // namespace cuda - -} // namespace oneflow - -#endif // defined(__HIPCC__) - -#endif // ONEFLOW_CORE_CUDA_ATOMIC_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_HIP_ATOMIC_H_ +#define ONEFLOW_CORE_HIP_ATOMIC_H_ + +#if defined(__HIPCC__) + +#include +#include +#include + +namespace oneflow { + +namespace cuda { + +namespace atomic { + +namespace internal { + +template +__device__ __forceinline__ T CastCASImpl(T* address, T compare, T val) { + static_assert(sizeof(T) == sizeof(U), ""); + U ret = atomicCAS(reinterpret_cast(address), *(reinterpret_cast(&compare)), + *(reinterpret_cast(&val))); + return *(reinterpret_cast(&ret)); +} + +template +__device__ __forceinline__ typename std::enable_if::type +CASImpl(T* address, T compare, T val) { + return CastCASImpl(address, compare, val); +} + +template +__device__ __forceinline__ + typename std::enable_if::type + CASImpl(T* address, T compare, T val) { + return CastCASImpl(address, compare, val); +} + +__device__ __forceinline__ int CASImpl(int* address, int compare, int val) { + return atomicCAS(address, compare, val); +} + +__device__ __forceinline__ unsigned int CASImpl(unsigned int* address, unsigned int compare, + unsigned int val) { + return atomicCAS(address, compare, val); +} + +__device__ __forceinline__ unsigned long long int CASImpl(unsigned long long int* address, + unsigned long long int compare, + unsigned long long int val) { + return atomicCAS(address, compare, val); +} + +// #if __CUDA_ARCH__ >= 700 + +// __device__ __forceinline__ unsigned short int CASImpl(unsigned short int* address, +// unsigned short int compare, +// unsigned short int val) { +// return atomicCAS(address, compare, val); +// } + +// #endif // __CUDA_ARCH__ >= 700 + +template +struct AddOp { + __device__ __forceinline__ T operator()(T a, T b) { return a + b; } +}; + +template class BinaryOp> +__device__ __forceinline__ T AtomicCASBinaryImpl(T* address, T val) { + T old = *address; + T assumed; + do { + assumed = old; + old = CASImpl(address, assumed, BinaryOp()(old, val)); + } while (old != assumed); + return old; +} + +template +__device__ __forceinline__ T AddImpl(T* address, T val) { + return AtomicCASBinaryImpl(address, val); +} + +__device__ __forceinline__ int AddImpl(int* address, int val) { return atomicAdd(address, val); } + +__device__ __forceinline__ unsigned int AddImpl(unsigned int* address, unsigned int val) { + return atomicAdd(address, val); +} + +__device__ __forceinline__ unsigned long long int AddImpl(unsigned long long int* address, + unsigned long long int val) { + return atomicAdd(address, val); +} + +__device__ __forceinline__ uint64_t AddImpl(uint64_t* address, uint64_t val) { + static_assert(sizeof(uint64_t) == sizeof(unsigned long long int), ""); + return static_cast(atomicAdd(reinterpret_cast(address), + static_cast(val))); +} + +__device__ __forceinline__ float AddImpl(float* address, float val) { + return atomicAdd(address, val); +} + +// #if __CUDA_ARCH__ >= 600 + +__device__ __forceinline__ double AddImpl(double* address, double val) { + return atomicAdd(address, val); +} + +// __device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { +// return atomicAdd(address, val); +// } + +// #endif // __CUDA_ARCH__ >= 600 + +// #if __CUDA_ARCH__ >= 700 + +__device__ __forceinline__ half AddImpl(half* address, half val) { + float address_value = __half2float(*address); + return __float2half(atomicAdd(&address_value, __half2float(val))); } + +// #endif // __CUDA_ARCH__ >= 700 + +// #if __CUDA_ARCH__ >= 800 + +// __device__ __forceinline__ nv_bfloat16 AddImpl(nv_bfloat16* address, nv_bfloat16 val) { +// return atomicAdd(address, val); +// } + +// #endif // 
__CUDA_ARCH__ >= 800 + +// #if __CUDA_ARCH__ < 530 + +__device__ __forceinline__ half2 AddImpl(half2* address, half2 val) { + half2 res; + float2 address_value = __half22float2(*address); + res.data.x = __float2half(atomicAdd(&address_value.x, __half2float(val.data.x))); + res.data.y = __float2half(atomicAdd(&address_value.y, __half2float(val.data.y))); + return res; +} + +// #endif // __CUDA_ARCH__ < 530 + +} // namespace internal + +template +__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { + return static_cast(v); +} + +template +__device__ __forceinline__ typename std::enable_if::value, T>::type Cast(U v) { + return v; +} + +template +__device__ __forceinline__ T CAS(T* address, U compare, V val) { + return internal::CASImpl(address, Cast(compare), Cast(val)); +} + +template +__device__ __forceinline__ T Add(T* address, U val) { + return internal::AddImpl(address, Cast(val)); +} + +__device__ __forceinline__ float Max(float* address, const float val) { + int* address_as_i = (int*)address; + int old = *address_as_i; + int assumed = 0; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); + } while (assumed != old); + return __int_as_float(old); +} + +__device__ __forceinline__ double Max(double* address, const double val) { + unsigned long long int* address_as_i = (unsigned long long int*)address; + unsigned long long int old = *address_as_i; + unsigned long long int assumed = 0; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, + __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); + } while (assumed != old); + return __longlong_as_double(old); +} + +} // namespace atomic + +} // namespace cuda + +} // namespace oneflow + +#endif // defined(__HIPCC__) + +#endif // ONEFLOW_CORE_CUDA_ATOMIC_H_ diff --git a/oneflow/core/hip/elementwise.hip.h b/oneflow/core/hip/elementwise.hip.h index 29b51fd..8c5ae25 100644 --- a/oneflow/core/hip/elementwise.hip.h +++ b/oneflow/core/hip/elementwise.hip.h @@ -1,243 +1,243 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
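Note that the half overload of AddImpl above performs the atomicAdd on a local float copy of the loaded value, so the sum is never written back through `address`. If an atomic read-modify-write is wanted, one option is to CAS the containing 32-bit word, along the lines of the generic AtomicCASBinaryImpl in the same header. The sketch below is illustrative, not part of the patch, and assumes the __half_as_ushort / __ushort_as_half bit-cast intrinsics from the fp16 header are available:

// Illustrative helper, not part of the patch.
__device__ __forceinline__ half AtomicAddHalfViaCas(half* address, half val) {
  // Work on the aligned 32-bit word that contains the 16-bit target.
  unsigned int* base = reinterpret_cast<unsigned int*>(
      reinterpret_cast<size_t>(address) & ~static_cast<size_t>(3));
  const unsigned int shift = (reinterpret_cast<size_t>(address) & 2) ? 16u : 0u;
  unsigned int old = *base;
  unsigned int assumed = 0;
  do {
    assumed = old;
    const half old_half =
        __ushort_as_half(static_cast<unsigned short>((assumed >> shift) & 0xffffu));
    const half new_half = __float2half(__half2float(old_half) + __half2float(val));
    const unsigned int new_bits = static_cast<unsigned int>(__half_as_ushort(new_half));
    const unsigned int updated = (assumed & ~(0xffffu << shift)) | (new_bits << shift);
    old = atomicCAS(base, assumed, updated);
  } while (old != assumed);
  // Match AddImpl's convention of returning the value seen at *address before the add.
  return __ushort_as_half(static_cast<unsigned short>((old >> shift) & 0xffffu));
}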
-*/ -#ifndef ONEFLOW_CORE_HIP_ELEMENTWISE_H_ -#define ONEFLOW_CORE_HIP_ELEMENTWISE_H_ - -#ifdef WITH_ROCM - -#include -#include -#include -#include - -namespace oneflow { - -namespace cuda { - -namespace elementwise { - -constexpr int kBlockSize = 256; -constexpr int kNumWaves = 32; - -inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int tpm; - { - hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); - if (err != hipSuccess) { return err; } - } - *num_blocks = std::max(1, std::min((n + kBlockSize - 1) / kBlockSize, - sm_count * tpm / kBlockSize * kNumWaves)); - return hipSuccess; -} - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == sizeof(T) * pack_size, ""); - __device__ Pack() { - // do nothing - } - PackType storage; - T elem[pack_size]; -}; - -template -struct alignas(sizeof(T) * pack_size) Packed { - __device__ Packed() { - // do nothing - } - union { - T elem[pack_size]; - }; -}; - -constexpr int kMaxPackBytes = 128 / 8; -constexpr int kMaxPackSize = 8; - -constexpr int Min(int a, int b) { return a < b ? a : b; } - -template -constexpr int PackSize() { - return Min(kMaxPackBytes / sizeof(T), kMaxPackSize); -} - -template -constexpr int PackSize() { - return Min(PackSize(), PackSize()); -} - -template -class HasApply2 { - typedef char one; - struct two { - char x[2]; - }; - - template - static one test(decltype(&C::Apply2)); - template - static two test(...); - - public: - enum { value = sizeof(test(0)) == sizeof(char) }; -}; - -template -__device__ typename std::enable_if::value == true && pack_size % 2 == 0, - Packed>::type -ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { - Packed ret; -#pragma unroll - for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in + j)...); } - return ret; -} - -template -__device__ typename std::enable_if::value == false || pack_size % 2 != 0, - Packed>::type -ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { - Packed ret; -#pragma unroll - for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in[j])...); } - return ret; -} - -template -__global__ void __launch_bounds__(kBlockSize) - ApplyGeneric(FactoryT factory, int64_t n_pack, Packed* pack_r, - const Packed*... pack_in, int64_t n_tail, R* tail_r, - const IN*... tail_in) { - auto functor = factory(); - const int global_tid = blockIdx.x * kBlockSize + threadIdx.x; - for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) { - pack_r[i] = ApplyPack(functor, (pack_in[i].elem)...); - } - if (tail && global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); } -} - -template -struct SimpleFactory { - explicit SimpleFactory(FunctorT functor) : tpl(functor) {} - __device__ FunctorT operator()() const { return tpl; } - - private: - FunctorT tpl; -}; - -template -bool IsAligendForPack() { - return true; -} - -template -bool IsAligendForPack(const T* ptr, const Args*... others) { - return reinterpret_cast(ptr) % sizeof(Pack) == 0 - && IsAligendForPack(others...); -} - -template -hipError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... 
in, hipStream_t stream) { - const int64_t n_pack = n / pack_size; - const int64_t tail_offset = n_pack * pack_size; - const int64_t n_tail = n - tail_offset; - int num_blocks; - { - hipError_t err = GetNumBlocks(n_pack, &num_blocks); - if (err != hipSuccess) { return err; } - } - auto func = n_tail > 0 ? ApplyGeneric - : ApplyGeneric; - hipLaunchKernelGGL(func, num_blocks, kBlockSize, 0, stream, - factory, n_pack, reinterpret_cast*>(r), - (reinterpret_cast*>(in))..., n_tail, r + tail_offset, - (in + tail_offset)...); - return hipPeekAtLastError(); -} - -template -struct GenericLauncher { - static hipError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in, - hipStream_t stream) { - constexpr int max_pack_size = PackSize(); - if (IsAligendForPack(r, in...)) { - return LaunchKernel(factory, n, r, in..., stream); - } else { - return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in..., stream); - } - } -}; - -template -inline hipError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, - hipStream_t stream) { - return GenericLauncher::Launch(factory, n, r, a, stream); -} - -template -inline hipError_t Unary(FunctorT functor, int64_t n, R* r, const A* a, hipStream_t stream) { - return UnaryWithFactory(SimpleFactory(functor), n, r, a, stream); -} - -template -inline hipError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, - hipStream_t stream) { - return GenericLauncher::Launch(factory, n, r, a, b, stream); -} - -template -inline hipError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, - hipStream_t stream) { - return BinaryWithFactory(SimpleFactory(functor), n, r, a, b, stream); -} - -template -inline hipError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, - const C* c, hipStream_t stream) { - return GenericLauncher::Launch(factory, n, r, a, b, c, stream); -} - -template -inline hipError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c, - hipStream_t stream) { - return TernaryWithFactory(SimpleFactory(functor), n, r, a, b, c, stream); -} - -} // namespace elementwise - -} // namespace cuda - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_ELEMENTWISE_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
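A quick sanity check of the pack-width selection in this header: with kMaxPackBytes = 128 / 8 = 16 and kMaxPackSize = 8, PackSize selects the widest access that still fits in 128 bits, and for multiple operand types the smallest per-type width wins. The asserts below are illustrative and would hold in any translation unit that includes this header together with the HIP fp16 header:

// Values follow directly from kMaxPackBytes = 16 and kMaxPackSize = 8.
using oneflow::cuda::elementwise::PackSize;
static_assert(PackSize<half>() == 8, "16 B / sizeof(half)");
static_assert(PackSize<float>() == 4, "16 B / sizeof(float)");
static_assert(PackSize<double>() == 2, "16 B / sizeof(double)");
static_assert(PackSize<half, float>() == 4, "min of per-type pack sizes");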
+*/ +#ifndef ONEFLOW_CORE_HIP_ELEMENTWISE_H_ +#define ONEFLOW_CORE_HIP_ELEMENTWISE_H_ + +#ifdef WITH_ROCM + +#include +#include +#include +#include + +namespace oneflow { + +namespace cuda { + +namespace elementwise { + +constexpr int kBlockSize = 256; +constexpr int kNumWaves = 32; + +inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int tpm; + { + hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); + if (err != hipSuccess) { return err; } + } + *num_blocks = std::max(1, std::min((n + kBlockSize - 1) / kBlockSize, + sm_count * tpm / kBlockSize * kNumWaves)); + return hipSuccess; +} + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * pack_size, ""); + __device__ Pack() { + // do nothing + } + PackType storage; + T elem[pack_size]; +}; + +template +struct alignas(sizeof(T) * pack_size) Packed { + __device__ Packed() { + // do nothing + } + union { + T elem[pack_size]; + }; +}; + +constexpr int kMaxPackBytes = 128 / 8; +constexpr int kMaxPackSize = 8; + +constexpr int Min(int a, int b) { return a < b ? a : b; } + +template +constexpr int PackSize() { + return Min(kMaxPackBytes / sizeof(T), kMaxPackSize); +} + +template +constexpr int PackSize() { + return Min(PackSize(), PackSize()); +} + +template +class HasApply2 { + typedef char one; + struct two { + char x[2]; + }; + + template + static one test(decltype(&C::Apply2)); + template + static two test(...); + + public: + enum { value = sizeof(test(0)) == sizeof(char) }; +}; + +template +__device__ typename std::enable_if::value == true && pack_size % 2 == 0, + Packed>::type +ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { + Packed ret; +#pragma unroll + for (int j = 0; j < pack_size; j += 2) { functor.Apply2(ret.elem + j, (in + j)...); } + return ret; +} + +template +__device__ typename std::enable_if::value == false || pack_size % 2 != 0, + Packed>::type +ApplyPack(const FunctorT& functor, const IN... in[pack_size]) { + Packed ret; +#pragma unroll + for (int j = 0; j < pack_size; ++j) { ret.elem[j] = functor((in[j])...); } + return ret; +} + +template +__global__ void __launch_bounds__(kBlockSize) + ApplyGeneric(FactoryT factory, int64_t n_pack, Packed* pack_r, + const Packed*... pack_in, int64_t n_tail, R* tail_r, + const IN*... tail_in) { + auto functor = factory(); + const int global_tid = blockIdx.x * kBlockSize + threadIdx.x; + for (int64_t i = global_tid; i < n_pack; i += blockDim.x * gridDim.x) { + pack_r[i] = ApplyPack(functor, (pack_in[i].elem)...); + } + if (tail && global_tid < n_tail) { tail_r[global_tid] = functor((tail_in[global_tid])...); } +} + +template +struct SimpleFactory { + explicit SimpleFactory(FunctorT functor) : tpl(functor) {} + __device__ FunctorT operator()() const { return tpl; } + + private: + FunctorT tpl; +}; + +template +bool IsAligendForPack() { + return true; +} + +template +bool IsAligendForPack(const T* ptr, const Args*... others) { + return reinterpret_cast(ptr) % sizeof(Pack) == 0 + && IsAligendForPack(others...); +} + +template +hipError_t LaunchKernel(FactoryT factory, int64_t n, R* r, const IN*... 
in, hipStream_t stream) { + const int64_t n_pack = n / pack_size; + const int64_t tail_offset = n_pack * pack_size; + const int64_t n_tail = n - tail_offset; + int num_blocks; + { + hipError_t err = GetNumBlocks(n_pack, &num_blocks); + if (err != hipSuccess) { return err; } + } + auto func = n_tail > 0 ? ApplyGeneric + : ApplyGeneric; + hipLaunchKernelGGL(func, num_blocks, kBlockSize, 0, stream, + factory, n_pack, reinterpret_cast*>(r), + (reinterpret_cast*>(in))..., n_tail, r + tail_offset, + (in + tail_offset)...); + return hipPeekAtLastError(); +} + +template +struct GenericLauncher { + static hipError_t Launch(FactoryT factory, int64_t n, R* r, const IN*... in, + hipStream_t stream) { + constexpr int max_pack_size = PackSize(); + if (IsAligendForPack(r, in...)) { + return LaunchKernel(factory, n, r, in..., stream); + } else { + return LaunchKernel<1, FactoryT, R, IN...>(factory, n, r, in..., stream); + } + } +}; + +template +inline hipError_t UnaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, + hipStream_t stream) { + return GenericLauncher::Launch(factory, n, r, a, stream); +} + +template +inline hipError_t Unary(FunctorT functor, int64_t n, R* r, const A* a, hipStream_t stream) { + return UnaryWithFactory(SimpleFactory(functor), n, r, a, stream); +} + +template +inline hipError_t BinaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, + hipStream_t stream) { + return GenericLauncher::Launch(factory, n, r, a, b, stream); +} + +template +inline hipError_t Binary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, + hipStream_t stream) { + return BinaryWithFactory(SimpleFactory(functor), n, r, a, b, stream); +} + +template +inline hipError_t TernaryWithFactory(FactoryT factory, int64_t n, R* r, const A* a, const B* b, + const C* c, hipStream_t stream) { + return GenericLauncher::Launch(factory, n, r, a, b, c, stream); +} + +template +inline hipError_t Ternary(FunctorT functor, int64_t n, R* r, const A* a, const B* b, const C* c, + hipStream_t stream) { + return TernaryWithFactory(SimpleFactory(functor), n, r, a, b, c, stream); +} + +} // namespace elementwise + +} // namespace cuda + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_ELEMENTWISE_H_ diff --git a/oneflow/core/hip/layer_norm.hip.h b/oneflow/core/hip/layer_norm.hip.h index 97891c3..183197f 100644 --- a/oneflow/core/hip/layer_norm.hip.h +++ b/oneflow/core/hip/layer_norm.hip.h @@ -1,1606 +1,1606 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
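The Unary/Binary/Ternary entry points are the public surface of this header. A minimal usage sketch (ScaleFunctor and ScaleAsync are illustrative names, not part of the patch):

// Illustrative only: scale n floats from x into y on the given HIP stream.
struct ScaleFunctor {
  explicit ScaleFunctor(float alpha) : alpha(alpha) {}
  __device__ float operator()(float x) const { return alpha * x; }
  float alpha;
};

hipError_t ScaleAsync(float* y, const float* x, int64_t n, float alpha, hipStream_t stream) {
  // Unary() picks a pack width from element type and pointer alignment,
  // then launches ApplyGeneric on `stream`; the call itself is asynchronous.
  return oneflow::cuda::elementwise::Unary(ScaleFunctor(alpha), n, y, x, stream);
}

GenericLauncher falls back to pack size 1 when any of the pointers is not aligned for the packed type, so the same call also works for arbitrarily offset buffers.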
-*/ - -#ifndef ONEFLOW_CORE_HIP_LAYER_NORM_H_ -#define ONEFLOW_CORE_HIP_LAYER_NORM_H_ - -#ifdef WITH_ROCM - -#include "hip/hip_runtime.h" -#include -// #include -#include - -namespace oneflow { - -namespace cuda { - -namespace layer_norm { - -constexpr int kWarpSize = 64; - -template -struct SumOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } -}; - -template -struct MaxOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } -}; - -template class ReductionOp, typename T, int thread_group_width = kWarpSize> -__inline__ __device__ T WarpAllReduce(T val) { - for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { - // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask, thread_group_width)); - val = ReductionOp()(val, __shfl_xor(val, mask, thread_group_width)); - } - return val; -} - -template class ReductionOp, typename T, int block_size> -__inline__ __device__ T BlockAllReduce(T val) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T result_broadcast; - T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); - if (threadIdx.x == 0) { result_broadcast = result; } - __syncthreads(); - return result_broadcast; -} - -template -__inline__ __device__ T Div(T a, T b); - -template<> -__inline__ __device__ float Div(float a, float b) { -#ifdef OF_LAYER_NORM_USE_FAST_MATH - return __fdividef(a, b); -#else - return a / b; -#endif -} - -template<> -__inline__ __device__ double Div(double a, double b) { - return a / b; -} - -template -__inline__ __device__ T Rsqrt(T x); - -template<> -__inline__ __device__ float Rsqrt(float x) { -#ifdef OF_LAYER_NORM_USE_FAST_MATH - return __frsqrt_rn(x); -#else - return rsqrt(x); -#endif -} - -template<> -__inline__ __device__ double Rsqrt(double x) { - return rsqrt(x); -} - -template -inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, - int64_t max_blocks, int64_t waves, int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int max_active_blocks; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, - block_size, dynamic_smem_size); - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); - return hipSuccess; -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -// #if CUDA_VERSION >= 11000 -// template<> -// struct DefaultComputeType { -// using type = float; -// }; -// #endif // CUDA_VERSION >= 11000 - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == sizeof(T) * N, ""); - __device__ Pack() { - // do nothing - } - PackType storage; - T elem[N]; -}; - -template -struct DirectLoad { - DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { dst[i] = 
static_cast(pack.elem[i]); } - } - const SRC* src; - int64_t row_size; -}; - -template -struct DirectStore { - DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - Pack pack; - const int64_t offset = (row * row_size + col) / N; -#pragma unroll - for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - int64_t row_size; -}; - -template -inline __device__ void WelfordCombine(T val, T* mean, T* m2, T* count) { - // Use Welford Online algorithem to compute mean and variance - // For more details you can refer to: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - *count += 1; - T delta1 = val - *mean; - *mean += Div(delta1, *count); - T delta2 = val - *mean; - *m2 += delta1 * delta2; -} - -template -inline __device__ void WelfordCombine(T b_mean, T b_m2, T b_count, T* mean, T* m2, T* count) { - if (b_count == 0) { return; } - T new_count = *count + b_count; - T nb_over_n = Div(b_count, new_count); - T delta = b_mean - *mean; - *mean += delta * nb_over_n; - *m2 += b_m2 + delta * delta * (*count) * nb_over_n; - *count = new_count; -} - -template -__inline__ __device__ void WelfordWarpReduce(T thread_mean, T thread_m2, T thread_count, T* mean, - T* m2, T* count) { - *mean = thread_mean; - *m2 = thread_m2; - *count = thread_count; - for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { - // T b_mean = __shfl_down(0xffffffff, *mean, mask, thread_group_width); - // T b_m2 = __shfl_down(0xffffffff, *m2, mask, thread_group_width); - // T b_count = __shfl_down(0xffffffff, *count, mask, thread_group_width); - T b_mean = __shfl_down(*mean, mask, thread_group_width); - T b_m2 = __shfl_down(*m2, mask, thread_group_width); - T b_count = __shfl_down(*count, mask, thread_group_width); - WelfordCombine(b_mean, b_m2, b_count, mean, m2, count); - } -} - -template -__inline__ __device__ void WelfordWarpAllReduce(T thread_mean, T thread_m2, T thread_count, T* mean, - T* m2, T* count) { - WelfordWarpReduce(thread_mean, thread_m2, thread_count, mean, m2, count); - // *mean = __shfl(0xffffffff, *mean, 0, thread_group_width); - // *m2 = __shfl(0xffffffff, *m2, 0, thread_group_width); - // *count = __shfl(0xffffffff, *count, 0, thread_group_width); - *mean = __shfl(*mean, 0, thread_group_width); - *m2 = __shfl(*m2, 0, thread_group_width); - *count = __shfl(*count, 0, thread_group_width); -} - -template -__inline__ __device__ void WelfordBlockAllReduce(T thread_mean, T thread_m2, T thread_count, - T* result_mean, T* result_m2, T* result_count) { - __shared__ T mean_shared[kWarpSize]; - __shared__ T m2_shared[kWarpSize]; - __shared__ T count_shared[kWarpSize]; - __shared__ T mean_result_broadcast; - __shared__ T m2_result_broadcast; - __shared__ T count_result_broadcast; - const int lid = threadIdx.x % kWarpSize; - const int wid = threadIdx.x / kWarpSize; - T warp_mean = 0; - T warp_m2 = 0; - T warp_count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count); - __syncthreads(); - if (lid == 0) { - mean_shared[wid] = warp_mean; - m2_shared[wid] = warp_m2; - count_shared[wid] = warp_count; - } - __syncthreads(); - if (wid == 0) { - if (threadIdx.x < blockDim.x / kWarpSize) { - warp_mean = mean_shared[lid]; - warp_m2 = m2_shared[lid]; - warp_count = count_shared[lid]; - } else { - warp_mean = static_cast(0); - warp_m2 = static_cast(0); - 
warp_count = static_cast(0); - } - __syncthreads(); - T block_mean = 0; - T block_m2 = 0; - T block_count = 0; - WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count); - if (lid == 0) { - mean_result_broadcast = block_mean; - m2_result_broadcast = block_m2; - count_result_broadcast = block_count; - } - } - __syncthreads(); - *result_mean = mean_result_broadcast; - *result_m2 = m2_result_broadcast; - *result_count = count_result_broadcast; -} - -template -__global__ void LayerNormWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - static_assert(cols_per_thread % pack_size == 0, ""); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - constexpr int num_packs = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - ComputeType buf[rows_per_access][cols_per_thread]; - const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int64_t num_global_thread_group = gridDim.x * blockDim.y; - const int64_t lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_mean[rows_per_access]; - ComputeType thread_m2[rows_per_access]; - ComputeType thread_count[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_mean[row_id] = 0; - thread_m2[row_id] = 0; - thread_count[row_id] = 0; - ComputeType* row_buf = buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; ++pack_id) { - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - const int pack_offset = pack_id * pack_size; - if (!padding || col < cols) { - load.template load(row_buf + pack_offset, row + row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - WelfordCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, - thread_count + row_id); - } - } else { -#pragma unroll - for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = 0; } - } - } - } - ComputeType warp_mean[rows_per_access]; - ComputeType warp_m2[rows_per_access]; - ComputeType warp_count[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - int global_row_id = row + row_id; - ComputeType* row_buf = buf[row_id]; - WelfordWarpAllReduce( - thread_mean[row_id], thread_m2[row_id], thread_count[row_id], warp_mean + row_id, - warp_m2 + row_id, warp_count + row_id); - ComputeType row_mean = warp_mean[row_id]; - ComputeType row_variance = - max(Div(warp_m2[row_id], warp_count[row_id]), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (lane_id == 0) { - mean[global_row_id] = row_mean; - inv_variance[global_row_id] = row_inv_var; - } -#pragma unroll - for (int i = 0; i < cols_per_thread; ++i) { - row_buf[i] = (row_buf[i] - row_mean) * row_inv_var; - } -#pragma unroll - for (int i = 0; i < num_packs; ++i) { - const int col = (i * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - store.template store(row_buf + i * pack_size, global_row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) 
{ - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormWarpImpl, - block_size, 0, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormWarpImpl - <<>>(load, store, rows, cols, epsilon, mean, inv_variance); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchLayerNormWarpImplPadding(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchLayerNormWarpImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else { - return LaunchLayerNormWarpImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } -} - -template -typename std::enable_if::type DispatchLayerNormWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } else { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ - inv_variance); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchLayerNormWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } else { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } \ - } - 
DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ - inv_variance); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} -template -typename std::enable_if::type DispatchLayerNormWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } else { \ - return DispatchLayerNormWarpImplPadding( \ - stream, load, store, rows, cols, epsilon, mean, inv_variance); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ - inv_variance); \ - } - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchLayerNormWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols % 4 == 0) { - return DispatchLayerNormWarpImplCols( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else if (cols % 2 == 0) { - return DispatchLayerNormWarpImplCols( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else { - return DispatchLayerNormWarpImplCols( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } - } -}; - -template -inline hipError_t DispatchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - return DispatchLayerNormWarpImplPackSize()( - stream, load, store, rows, cols, epsilon, mean, inv_variance); -} - -template -__global__ void LayerNormBlockSMemImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_mean = 0; - ComputeType thread_m2 = 0; - ComputeType thread_count = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += 
block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); - } - } - ComputeType row_mean = 0; - ComputeType row_m2 = 0; - ComputeType row_count = 0; - WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, - &row_count); - ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - mean[row] = row_mean; - inv_variance[row] = row_inv_var; - } - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void LayerNormBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_mean = 0; - ComputeType thread_m2 = 0; - ComputeType thread_count = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); - } - } - ComputeType row_mean = 0; - ComputeType row_m2 = 0; - ComputeType row_count = 0; - WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, - &row_count); - ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - mean[row] = row_mean; - inv_variance[row] = row_inv_var; - } - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - - -template -inline hipError_t LaunchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, - int smem, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormBlockSMemImpl, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormBlockSMemImpl - <<>>(load, store, rows, cols, epsilon, mean, - inv_variance); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchLayerNormBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, - int smem, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - 
GetNumBlocks(LayerNormBlockSMemImpl_1024, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormBlockSMemImpl_1024 - <<>>(load, store, rows, cols, epsilon, mean, - inv_variance); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchLayerNormBlockSMemImplBlockSize( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, ComputeType* inv_variance, bool* success) { - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType); - int max_active_blocks_conf_1; - - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - LayerNormBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - LayerNormBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormBlockSMemImpl_1024( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - LayerNormBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - - if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormBlockSMemImpl( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - LayerNormBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormBlockSMemImpl( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); - } - *success = true; - return LaunchLayerNormBlockSMemImpl( - stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); -} - -template -struct TryDispatchLayerNormBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance, bool* success) { - if (cols % 4 == 0) { - return TryDispatchLayerNormBlockSMemImplBlockSize( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); - } else if (cols % 2 == 0) { - return TryDispatchLayerNormBlockSMemImplBlockSize( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); - } else { - return TryDispatchLayerNormBlockSMemImplBlockSize( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); - } - } -}; - -template -inline hipError_t TryDispatchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance, bool* success) { - return TryDispatchLayerNormBlockSMemImplPackSize()( - stream, load, store, rows, cols, epsilon, mean, inv_variance, success); -} - 
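
TryDispatchLayerNormBlockSMemImplBlockSize (removed above, and re-added in HIP form later in this patch) picks the block size for the shared-memory kernel purely from reported occupancy: if even a 128-thread block cannot stay resident with one row of ComputeType in dynamic shared memory, it reports failure so the caller falls back to the uncached kernel; otherwise it takes the largest of 1024, 512, or 256 threads whose occupancy matches that of the 128-thread configuration, keeping 128 as the last resort. Below is a minimal host-side sketch of just that decision order, with the occupancy counts passed in as plain integers; the function name and parameters are illustrative and not part of the patch.

#include <cstdio>

// Illustrative only: mirrors the selection order of
// TryDispatchLayerNormBlockSMemImplBlockSize, with the occupancy values
// (active blocks per multiprocessor at each block size) supplied by the caller.
int PickLayerNormBlockSize(int occ_128, int occ_256, int occ_512, int occ_1024,
                           bool* success) {
  if (occ_128 <= 0) {
    // Even the smallest configuration cannot be resident with the required
    // dynamic shared memory; the caller should use the uncached kernel instead.
    *success = false;
    return 0;
  }
  *success = true;
  // Prefer the largest block size that keeps the same occupancy as 128 threads.
  if (occ_1024 == occ_128) { return 1024; }
  if (occ_512 == occ_128) { return 512; }
  if (occ_256 == occ_128) { return 256; }
  return 128;
}

int main() {
  bool ok = false;
  // Example: 512-thread blocks match the occupancy of 128-thread blocks,
  // but 1024-thread blocks would lower it, so 512 is chosen.
  int chosen = PickLayerNormBlockSize(/*occ_128=*/2, /*occ_256=*/2, /*occ_512=*/2,
                                      /*occ_1024=*/1, &ok);
  std::printf("success=%d block_size=%d\n", ok ? 1 : 0, chosen);
  return 0;
}

In other words, a larger block is only chosen when it costs no resident blocks per multiprocessor, which is the condition the occupancy comparisons above encode.
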
-template -__global__ void LayerNormBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, - ComputeType* mean, ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_mean = 0; - ComputeType thread_m2 = 0; - ComputeType thread_count = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); - } - } - ComputeType row_mean = 0; - ComputeType row_m2 = 0; - ComputeType row_count = 0; - WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, - &row_count); - ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); - ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); - if (threadIdx.x == 0) { - mean[row] = row_mean; - inv_variance[row] = row_inv_var; - } - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - const int pack_offset = pack_id * pack_size; - load.template load(pack, row, pack_offset); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { pack[i] = (pack[i] - row_mean) * row_inv_var; } - store.template store(pack, row, pack_offset); - } - } -} - -template -inline hipError_t LaunchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormBlockUncachedImpl, - block_size, 0, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormBlockUncachedImpl - <<>>(load, store, rows, cols, epsilon, mean, inv_variance); - return hipPeekAtLastError(); -} - -template -struct DispatchLayerNormBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols % 4 == 0) { - return LaunchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else if (cols % 2 == 0) { - return LaunchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } else { - return LaunchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } - } -}; - -template -inline hipError_t DispatchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - return DispatchLayerNormBlockUncachedImplPackSize()( - stream, load, store, rows, cols, epsilon, mean, inv_variance); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - if (cols <= 1024) { - return DispatchLayerNormWarpImpl(stream, load, store, rows, cols, - epsilon, mean, inv_variance); - } else { - bool 
dispatch_smem_impl_success; - { - hipError_t err = TryDispatchLayerNormBlockSMemImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance, - &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, const double epsilon, ComputeType* mean, - ComputeType* inv_variance) { - return DispatchLayerNormBlockUncachedImpl( - stream, load, store, rows, cols, epsilon, mean, inv_variance); -} - -/* -LayerNormGrad dx: -normalized = (x - mean) * inv_var -sum_stats1 = sum(scaled_dy) -sum_stats2 = sum(scaled_dy * normalized) -dx = cols * dy - sum_stats1 - normalized * sum_stats2 -dx *= inv_var / cols -*/ -template -__global__ void LayerNormGradWarpImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - constexpr int pack_per_thread = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - ComputeType normalized_buf[rows_per_access][cols_per_thread]; - ComputeType dy_buf[rows_per_access][cols_per_thread]; - const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); - const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int64_t num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType sum_stats1[rows_per_access]; - ComputeType sum_stats2[rows_per_access]; - ComputeType inv_variance_buf[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - const int global_row_id = row + row_id; - ComputeType mean_val = mean[global_row_id]; - inv_variance_buf[row_id] = inv_variance[global_row_id]; - sum_stats1[row_id] = 0; - sum_stats2[row_id] = 0; - ComputeType* row_normalized_buf = normalized_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - const int pack_offset = pack_id * pack_size; - if (!padding || col < cols) { - load_x.template load(row_normalized_buf + pack_offset, global_row_id, col); - load_scaled_dy.template load(row_dy_buf + pack_offset, global_row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int col_id = pack_offset + i; - // row_normalized_buf store x - row_normalized_buf[col_id] = - (row_normalized_buf[col_id] - mean_val) * inv_variance_buf[row_id]; - sum_stats1[row_id] += row_dy_buf[col_id]; - sum_stats2[row_id] += row_dy_buf[col_id] * row_normalized_buf[col_id]; - } - } - } - } - ComputeType warp_sum_stats1[rows_per_access]; - ComputeType warp_sum_stats2[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_sum_stats1[row_id] = - WarpAllReduce(sum_stats1[row_id]); - warp_sum_stats2[row_id] = - 
WarpAllReduce(sum_stats2[row_id]); - } -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - const int global_row_id = row + row_id; - ComputeType* row_normalized_buf = normalized_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; - const ComputeType inv_variance_over_cols = inv_variance_buf[row_id] * one_over_cols; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - for (int i = 0; i < pack_size; ++i) { - const int col_id = pack_id * pack_size + i; - row_dy_buf[col_id] = (cols * row_dy_buf[col_id] - warp_sum_stats1[row_id] - - row_normalized_buf[col_id] * warp_sum_stats2[row_id]) - * inv_variance_over_cols; - } - store.template store(row_dy_buf + pack_id * pack_size, global_row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = GetNumBlocks( - LayerNormGradWarpImpl, - block_size, 0, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradWarpImpl - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchLayerNormGradWarpImplPadding(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchLayerNormGradWarpImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - return LaunchLayerNormGradWarpImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } -} - -template -typename std::enable_if::type DispatchLayerNormGradWarpImplCols( - hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } else { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - 
DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchLayerNormGradWarpImplCols( - hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } else { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchLayerNormGradWarpImplPadding( \ - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchLayerNormGradWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0) { - return DispatchLayerNormGradWarpImplCols( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - return DispatchLayerNormGradWarpImplCols( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } - } -}; - -template -inline hipError_t DispatchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - return DispatchLayerNormGradWarpImplPackSize()( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); -} - -template -__global__ void LayerNormGradBlockSMemImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* normalized_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = normalized_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - const ComputeType 
one_over_cols = static_cast(1.0) / static_cast(cols); - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType sum_stats1 = 0; - ComputeType sum_stats2 = 0; - const ComputeType mean_val = mean[row]; - const ComputeType inv_variance_val = inv_variance[row]; - const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; - normalized_buf[buf_offset] = normalized; - dy_buf[buf_offset] = dy_pack[i]; - sum_stats1 += dy_pack[i]; - sum_stats2 += dy_pack[i] * normalized; - } - } - const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); - const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 - - normalized_buf[buf_offset] * row_sum_stats2) - * inv_variance_over_cols; - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void LayerNormGradBlockSMemImpl_1024(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* normalized_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = normalized_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType sum_stats1 = 0; - ComputeType sum_stats2 = 0; - const ComputeType mean_val = mean[row]; - const ComputeType inv_variance_val = inv_variance[row]; - const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; - normalized_buf[buf_offset] = normalized; - dy_buf[buf_offset] = dy_pack[i]; - sum_stats1 += dy_pack[i]; - sum_stats2 += dy_pack[i] * normalized; - } - } - const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); - const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - const int buf_offset = i * num_packs + pack_id; - pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 - - normalized_buf[buf_offset] * row_sum_stats2) - * inv_variance_over_cols; - } - store.template store(pack, 
row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradBlockSMemImpl - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchLayerNormGradBlockSMemImpl_1024(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl_1024, - block_size, smem, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradBlockSMemImpl_1024 - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchLayerNormGradBlockSMemImplBlockSize( - hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, - const int64_t cols, bool* success) { - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType) * 2; - int max_active_blocks_conf_1; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - LayerNormGradBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - LayerNormGradBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormGradBlockSMemImpl_1024( - stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - LayerNormGradBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormGradBlockSMemImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - LayerNormGradBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchLayerNormGradBlockSMemImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); - } - *success = true; - return LaunchLayerNormGradBlockSMemImpl(stream, load_x, load_scaled_dy, store, - mean, inv_variance, smem, rows, cols); -} - -template 
-struct TryDispatchLayerNormGradBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols, bool* success) { - if (cols % 2 == 0) { - return TryDispatchLayerNormGradBlockSMemImplBlockSize( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); - } else { - return TryDispatchLayerNormGradBlockSMemImplBlockSize( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); - } - } -}; - -template -inline hipError_t TryDispatchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols, - bool* success) { - return TryDispatchLayerNormGradBlockSMemImplPackSize()( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); -} - -template -__global__ void LayerNormGradBlockUncachedImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = static_cast(cols) / pack_size; - const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - const ComputeType mean_val = mean[row]; - const ComputeType inv_variance_val = inv_variance[row]; - const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; - ComputeType sum_stats1 = 0; - ComputeType sum_stats2 = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); - -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - sum_stats1 += dy_pack[i]; - sum_stats2 += dy_pack[i] * (x_pack[i] - mean_val) * inv_variance_val; - } - } - const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); - const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType x_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_x.template load(x_pack, row, pack_id * pack_size); - load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - dy_pack[i] = (cols * dy_pack[i] - row_sum_stats1 - - (x_pack[i] - mean_val) * inv_variance_val * row_sum_stats2) - * inv_variance_over_cols; - } - store.template store(dy_pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, STORE store, - const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = - GetNumBlocks(LayerNormGradBlockUncachedImpl, - block_size, 0, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - LayerNormGradBlockUncachedImpl - <<>>(load_x, load_scaled_dy, store, mean, inv_variance, - rows, cols); - return hipPeekAtLastError(); -} - -template -struct 
DispatchLayerNormGradBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0 && cols > kWarpSize) { - return LaunchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - return LaunchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } - } -}; - -template -inline hipError_t DispatchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, - LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, - const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - return DispatchLayerNormGradBlockUncachedImplPackSize()( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - if (cols <= 1024) { - return DispatchLayerNormGradWarpImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = - TryDispatchLayerNormGradBlockSMemImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, - &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, - STORE store, const ComputeType* mean, const ComputeType* inv_variance, - const int64_t rows, const int64_t cols) { - return DispatchLayerNormGradBlockUncachedImpl( - stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); -} - -} // namespace layer_norm - -} // namespace cuda - -} // namespace oneflow - - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_LAYER_NORM_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef ONEFLOW_CORE_HIP_LAYER_NORM_H_ +#define ONEFLOW_CORE_HIP_LAYER_NORM_H_ + +#ifdef WITH_ROCM + +#include "hip/hip_runtime.h" +#include +// #include +#include + +namespace oneflow { + +namespace cuda { + +namespace layer_norm { + +constexpr int kWarpSize = 64; + +template +struct SumOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct MaxOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } +}; + +template class ReductionOp, typename T, int thread_group_width = kWarpSize> +__inline__ __device__ T WarpAllReduce(T val) { + for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { + // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask, thread_group_width)); + val = ReductionOp()(val, __shfl_xor(val, mask, thread_group_width)); + } + return val; +} + +template class ReductionOp, typename T, int block_size> +__inline__ __device__ T BlockAllReduce(T val) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T result_broadcast; + T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); + if (threadIdx.x == 0) { result_broadcast = result; } + __syncthreads(); + return result_broadcast; +} + +template +__inline__ __device__ T Div(T a, T b); + +template<> +__inline__ __device__ float Div(float a, float b) { +#ifdef OF_LAYER_NORM_USE_FAST_MATH + return __fdividef(a, b); +#else + return a / b; +#endif +} + +template<> +__inline__ __device__ double Div(double a, double b) { + return a / b; +} + +template +__inline__ __device__ T Rsqrt(T x); + +template<> +__inline__ __device__ float Rsqrt(float x) { +#ifdef OF_LAYER_NORM_USE_FAST_MATH + return __frsqrt_rn(x); +#else + return rsqrt(x); +#endif +} + +template<> +__inline__ __device__ double Rsqrt(double x) { + return rsqrt(x); +} + +template +inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, + int64_t max_blocks, int64_t waves, int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int max_active_blocks; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, + block_size, dynamic_smem_size); + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); + return hipSuccess; +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +// #if CUDA_VERSION >= 11000 +// template<> +// struct DefaultComputeType { +// using type = float; +// }; +// #endif // CUDA_VERSION >= 11000 + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * N, ""); + __device__ Pack() { + // do nothing + } + PackType storage; + T elem[N]; +}; + +template +struct DirectLoad { + DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { dst[i] = 
static_cast(pack.elem[i]); } + } + const SRC* src; + int64_t row_size; +}; + +template +struct DirectStore { + DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + Pack pack; + const int64_t offset = (row * row_size + col) / N; +#pragma unroll + for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + int64_t row_size; +}; + +template +inline __device__ void WelfordCombine(T val, T* mean, T* m2, T* count) { + // Use Welford Online algorithem to compute mean and variance + // For more details you can refer to: + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + *count += 1; + T delta1 = val - *mean; + *mean += Div(delta1, *count); + T delta2 = val - *mean; + *m2 += delta1 * delta2; +} + +template +inline __device__ void WelfordCombine(T b_mean, T b_m2, T b_count, T* mean, T* m2, T* count) { + if (b_count == 0) { return; } + T new_count = *count + b_count; + T nb_over_n = Div(b_count, new_count); + T delta = b_mean - *mean; + *mean += delta * nb_over_n; + *m2 += b_m2 + delta * delta * (*count) * nb_over_n; + *count = new_count; +} + +template +__inline__ __device__ void WelfordWarpReduce(T thread_mean, T thread_m2, T thread_count, T* mean, + T* m2, T* count) { + *mean = thread_mean; + *m2 = thread_m2; + *count = thread_count; + for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { + // T b_mean = __shfl_down(0xffffffff, *mean, mask, thread_group_width); + // T b_m2 = __shfl_down(0xffffffff, *m2, mask, thread_group_width); + // T b_count = __shfl_down(0xffffffff, *count, mask, thread_group_width); + T b_mean = __shfl_down(*mean, mask, thread_group_width); + T b_m2 = __shfl_down(*m2, mask, thread_group_width); + T b_count = __shfl_down(*count, mask, thread_group_width); + WelfordCombine(b_mean, b_m2, b_count, mean, m2, count); + } +} + +template +__inline__ __device__ void WelfordWarpAllReduce(T thread_mean, T thread_m2, T thread_count, T* mean, + T* m2, T* count) { + WelfordWarpReduce(thread_mean, thread_m2, thread_count, mean, m2, count); + // *mean = __shfl(0xffffffff, *mean, 0, thread_group_width); + // *m2 = __shfl(0xffffffff, *m2, 0, thread_group_width); + // *count = __shfl(0xffffffff, *count, 0, thread_group_width); + *mean = __shfl(*mean, 0, thread_group_width); + *m2 = __shfl(*m2, 0, thread_group_width); + *count = __shfl(*count, 0, thread_group_width); +} + +template +__inline__ __device__ void WelfordBlockAllReduce(T thread_mean, T thread_m2, T thread_count, + T* result_mean, T* result_m2, T* result_count) { + __shared__ T mean_shared[kWarpSize]; + __shared__ T m2_shared[kWarpSize]; + __shared__ T count_shared[kWarpSize]; + __shared__ T mean_result_broadcast; + __shared__ T m2_result_broadcast; + __shared__ T count_result_broadcast; + const int lid = threadIdx.x % kWarpSize; + const int wid = threadIdx.x / kWarpSize; + T warp_mean = 0; + T warp_m2 = 0; + T warp_count = 0; + WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count); + __syncthreads(); + if (lid == 0) { + mean_shared[wid] = warp_mean; + m2_shared[wid] = warp_m2; + count_shared[wid] = warp_count; + } + __syncthreads(); + if (wid == 0) { + if (threadIdx.x < blockDim.x / kWarpSize) { + warp_mean = mean_shared[lid]; + warp_m2 = m2_shared[lid]; + warp_count = count_shared[lid]; + } else { + warp_mean = static_cast(0); + warp_m2 = static_cast(0); + 
warp_count = static_cast(0); + } + __syncthreads(); + T block_mean = 0; + T block_m2 = 0; + T block_count = 0; + WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count); + if (lid == 0) { + mean_result_broadcast = block_mean; + m2_result_broadcast = block_m2; + count_result_broadcast = block_count; + } + } + __syncthreads(); + *result_mean = mean_result_broadcast; + *result_m2 = m2_result_broadcast; + *result_count = count_result_broadcast; +} + +template +__global__ void LayerNormWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + static_assert(cols_per_thread % pack_size == 0, ""); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + constexpr int num_packs = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + ComputeType buf[rows_per_access][cols_per_thread]; + const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int64_t num_global_thread_group = gridDim.x * blockDim.y; + const int64_t lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_mean[rows_per_access]; + ComputeType thread_m2[rows_per_access]; + ComputeType thread_count[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_mean[row_id] = 0; + thread_m2[row_id] = 0; + thread_count[row_id] = 0; + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < num_packs; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + const int pack_offset = pack_id * pack_size; + if (!padding || col < cols) { + load.template load(row_buf + pack_offset, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + WelfordCombine(row_buf[pack_offset + i], thread_mean + row_id, thread_m2 + row_id, + thread_count + row_id); + } + } else { +#pragma unroll + for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = 0; } + } + } + } + ComputeType warp_mean[rows_per_access]; + ComputeType warp_m2[rows_per_access]; + ComputeType warp_count[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + int global_row_id = row + row_id; + ComputeType* row_buf = buf[row_id]; + WelfordWarpAllReduce( + thread_mean[row_id], thread_m2[row_id], thread_count[row_id], warp_mean + row_id, + warp_m2 + row_id, warp_count + row_id); + ComputeType row_mean = warp_mean[row_id]; + ComputeType row_variance = + max(Div(warp_m2[row_id], warp_count[row_id]), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (lane_id == 0) { + mean[global_row_id] = row_mean; + inv_variance[global_row_id] = row_inv_var; + } +#pragma unroll + for (int i = 0; i < cols_per_thread; ++i) { + row_buf[i] = (row_buf[i] - row_mean) * row_inv_var; + } +#pragma unroll + for (int i = 0; i < num_packs; ++i) { + const int col = (i * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + store.template store(row_buf + i * pack_size, global_row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) 
{ + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormWarpImpl, + block_size, 0, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormWarpImpl + <<>>(load, store, rows, cols, epsilon, mean, inv_variance); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchLayerNormWarpImplPadding(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchLayerNormWarpImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else { + return LaunchLayerNormWarpImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } +} + +template +typename std::enable_if::type DispatchLayerNormWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } else { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ + inv_variance); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchLayerNormWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } else { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } \ + } + 
DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ + inv_variance); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} +template +typename std::enable_if::type DispatchLayerNormWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } else { \ + return DispatchLayerNormWarpImplPadding( \ + stream, load, store, rows, cols, epsilon, mean, inv_variance); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormWarpImplPadding(stream, load, store, rows, cols, epsilon, mean, \ + inv_variance); \ + } + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchLayerNormWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols % 4 == 0) { + return DispatchLayerNormWarpImplCols( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else if (cols % 2 == 0) { + return DispatchLayerNormWarpImplCols( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else { + return DispatchLayerNormWarpImplCols( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } + } +}; + +template +inline hipError_t DispatchLayerNormWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + return DispatchLayerNormWarpImplPackSize()( + stream, load, store, rows, cols, epsilon, mean, inv_variance); +} + +template +__global__ void LayerNormBlockSMemImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += 
block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_count = 0; + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + mean[row] = row_mean; + inv_variance[row] = row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void LayerNormBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_count = 0; + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + mean[row] = row_mean; + inv_variance[row] = row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + pack[i] = (buf[i * num_packs + pack_id] - row_mean) * row_inv_var; + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + + +template +inline hipError_t LaunchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, + int smem, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormBlockSMemImpl, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormBlockSMemImpl + <<>>(load, store, rows, cols, epsilon, mean, + inv_variance); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchLayerNormBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, + int smem, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + 
GetNumBlocks(LayerNormBlockSMemImpl_1024, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormBlockSMemImpl_1024 + <<>>(load, store, rows, cols, epsilon, mean, + inv_variance); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchLayerNormBlockSMemImplBlockSize( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, ComputeType* inv_variance, bool* success) { + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType); + int max_active_blocks_conf_1; + + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + LayerNormBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + LayerNormBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormBlockSMemImpl_1024( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + LayerNormBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + LayerNormBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); + } + *success = true; + return LaunchLayerNormBlockSMemImpl( + stream, load, store, smem, rows, cols, epsilon, mean, inv_variance); +} + +template +struct TryDispatchLayerNormBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance, bool* success) { + if (cols % 4 == 0) { + return TryDispatchLayerNormBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); + } else if (cols % 2 == 0) { + return TryDispatchLayerNormBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); + } else { + return TryDispatchLayerNormBlockSMemImplBlockSize( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); + } + } +}; + +template +inline hipError_t TryDispatchLayerNormBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance, bool* success) { + return TryDispatchLayerNormBlockSMemImplPackSize()( + stream, load, store, rows, cols, epsilon, mean, inv_variance, success); +} + 
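+// ---------------------------------------------------------------------------
+// Editor's note: the standalone sketch below is an illustration added for
+// clarity and is not part of the original kernels. The block-size helper
+// above (TryDispatchLayerNormBlockSMemImplBlockSize) selects a block size by
+// occupancy probing: it queries max-active-blocks-per-SM for the smallest
+// candidate (128) as a baseline, then picks the largest candidate (1024, 512,
+// 256) whose occupancy does not fall below that baseline, and falls back to
+// 128 otherwise; if even 128 threads cannot stay resident with the requested
+// dynamic shared memory, the shared-memory path is rejected and the uncached
+// kernel is used instead. DummyRowKernel and PickBlockSizeByOccupancy are
+// hypothetical names used only for this sketch.
+// ---------------------------------------------------------------------------
+template<int block_size>
+__global__ void DummyRowKernel(const float* in, float* out, int cols) {
+  // Stand-in for a row-wise kernel that stages one row in dynamic shared memory.
+  extern __shared__ float row_buf[];
+  for (int i = threadIdx.x; i < cols; i += block_size) { row_buf[i] = in[blockIdx.x * cols + i]; }
+  __syncthreads();
+  for (int i = threadIdx.x; i < cols; i += block_size) { out[blockIdx.x * cols + i] = row_buf[i]; }
+}
+
+inline hipError_t PickBlockSizeByOccupancy(size_t smem, int* block_size, bool* success) {
+  // Baseline occupancy at the smallest candidate block size.
+  int base = 0;
+  hipError_t err =
+      hipOccupancyMaxActiveBlocksPerMultiprocessor(&base, DummyRowKernel<128>, 128, smem);
+  if (err != hipSuccess) { return err; }
+  if (base <= 0) {
+    // Not enough shared memory for even 128 threads per block: reject this path.
+    *success = false;
+    return hipSuccess;
+  }
+  int occ = 0;
+  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&occ, DummyRowKernel<1024>, 1024, smem);
+  if (err != hipSuccess) { return err; }
+  if (occ == base) { *block_size = 1024; *success = true; return hipSuccess; }
+  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&occ, DummyRowKernel<512>, 512, smem);
+  if (err != hipSuccess) { return err; }
+  if (occ == base) { *block_size = 512; *success = true; return hipSuccess; }
+  err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&occ, DummyRowKernel<256>, 256, smem);
+  if (err != hipSuccess) { return err; }
+  *block_size = (occ == base) ? 256 : 128;  // larger sizes would cost occupancy: keep 128
+  *success = true;
+  return hipSuccess;
+}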
+template +__global__ void LayerNormBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, + ComputeType* mean, ComputeType* inv_variance) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_mean = 0; + ComputeType thread_m2 = 0; + ComputeType thread_count = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + WelfordCombine(pack[i], &thread_mean, &thread_m2, &thread_count); + } + } + ComputeType row_mean = 0; + ComputeType row_m2 = 0; + ComputeType row_count = 0; + WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &row_mean, &row_m2, + &row_count); + ComputeType row_variance = max(Div(row_m2, row_count), static_cast(0.0)); + ComputeType row_inv_var = Rsqrt(row_variance + static_cast(epsilon)); + if (threadIdx.x == 0) { + mean[row] = row_mean; + inv_variance[row] = row_inv_var; + } + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + const int pack_offset = pack_id * pack_size; + load.template load(pack, row, pack_offset); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { pack[i] = (pack[i] - row_mean) * row_inv_var; } + store.template store(pack, row, pack_offset); + } + } +} + +template +inline hipError_t LaunchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormBlockUncachedImpl, + block_size, 0, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormBlockUncachedImpl + <<>>(load, store, rows, cols, epsilon, mean, inv_variance); + return hipPeekAtLastError(); +} + +template +struct DispatchLayerNormBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols % 4 == 0) { + return LaunchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else if (cols % 2 == 0) { + return LaunchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } else { + return LaunchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } + } +}; + +template +inline hipError_t DispatchLayerNormBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + return DispatchLayerNormBlockUncachedImplPackSize()( + stream, load, store, rows, cols, epsilon, mean, inv_variance); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + if (cols <= 1024) { + return DispatchLayerNormWarpImpl(stream, load, store, rows, cols, + epsilon, mean, inv_variance); + } else { + bool 
dispatch_smem_impl_success; + { + hipError_t err = TryDispatchLayerNormBlockSMemImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance, + &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNorm(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, const double epsilon, ComputeType* mean, + ComputeType* inv_variance) { + return DispatchLayerNormBlockUncachedImpl( + stream, load, store, rows, cols, epsilon, mean, inv_variance); +} + +/* +LayerNormGrad dx: +normalized = (x - mean) * inv_var +sum_stats1 = sum(scaled_dy) +sum_stats2 = sum(scaled_dy * normalized) +dx = cols * dy - sum_stats1 - normalized * sum_stats2 +dx *= inv_var / cols +*/ +template +__global__ void LayerNormGradWarpImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + constexpr int pack_per_thread = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + ComputeType normalized_buf[rows_per_access][cols_per_thread]; + ComputeType dy_buf[rows_per_access][cols_per_thread]; + const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); + const int64_t global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int64_t num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType sum_stats1[rows_per_access]; + ComputeType sum_stats2[rows_per_access]; + ComputeType inv_variance_buf[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + const int global_row_id = row + row_id; + ComputeType mean_val = mean[global_row_id]; + inv_variance_buf[row_id] = inv_variance[global_row_id]; + sum_stats1[row_id] = 0; + sum_stats2[row_id] = 0; + ComputeType* row_normalized_buf = normalized_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + const int pack_offset = pack_id * pack_size; + if (!padding || col < cols) { + load_x.template load(row_normalized_buf + pack_offset, global_row_id, col); + load_scaled_dy.template load(row_dy_buf + pack_offset, global_row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int col_id = pack_offset + i; + // row_normalized_buf store x + row_normalized_buf[col_id] = + (row_normalized_buf[col_id] - mean_val) * inv_variance_buf[row_id]; + sum_stats1[row_id] += row_dy_buf[col_id]; + sum_stats2[row_id] += row_dy_buf[col_id] * row_normalized_buf[col_id]; + } + } + } + } + ComputeType warp_sum_stats1[rows_per_access]; + ComputeType warp_sum_stats2[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_sum_stats1[row_id] = + WarpAllReduce(sum_stats1[row_id]); + warp_sum_stats2[row_id] = + 
WarpAllReduce(sum_stats2[row_id]); + } +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + const int global_row_id = row + row_id; + ComputeType* row_normalized_buf = normalized_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; + const ComputeType inv_variance_over_cols = inv_variance_buf[row_id] * one_over_cols; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + for (int i = 0; i < pack_size; ++i) { + const int col_id = pack_id * pack_size + i; + row_dy_buf[col_id] = (cols * row_dy_buf[col_id] - warp_sum_stats1[row_id] + - row_normalized_buf[col_id] * warp_sum_stats2[row_id]) + * inv_variance_over_cols; + } + store.template store(row_dy_buf + pack_id * pack_size, global_row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = GetNumBlocks( + LayerNormGradWarpImpl, + block_size, 0, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradWarpImpl + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchLayerNormGradWarpImplPadding(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchLayerNormGradWarpImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + return LaunchLayerNormGradWarpImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } +} + +template +typename std::enable_if::type DispatchLayerNormGradWarpImplCols( + hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } else { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + 
DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchLayerNormGradWarpImplCols( + hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } else { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchLayerNormGradWarpImplPadding( \ + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchLayerNormGradWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0) { + return DispatchLayerNormGradWarpImplCols( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + return DispatchLayerNormGradWarpImplCols( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } + } +}; + +template +inline hipError_t DispatchLayerNormGradWarpImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + return DispatchLayerNormGradWarpImplPackSize()( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); +} + +template +__global__ void LayerNormGradBlockSMemImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* normalized_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = normalized_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + const ComputeType 
one_over_cols = static_cast(1.0) / static_cast(cols); + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType sum_stats1 = 0; + ComputeType sum_stats2 = 0; + const ComputeType mean_val = mean[row]; + const ComputeType inv_variance_val = inv_variance[row]; + const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; + normalized_buf[buf_offset] = normalized; + dy_buf[buf_offset] = dy_pack[i]; + sum_stats1 += dy_pack[i]; + sum_stats2 += dy_pack[i] * normalized; + } + } + const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); + const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 + - normalized_buf[buf_offset] * row_sum_stats2) + * inv_variance_over_cols; + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void LayerNormGradBlockSMemImpl_1024(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* normalized_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = normalized_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType sum_stats1 = 0; + ComputeType sum_stats2 = 0; + const ComputeType mean_val = mean[row]; + const ComputeType inv_variance_val = inv_variance[row]; + const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + ComputeType normalized = (x_pack[i] - mean_val) * inv_variance_val; + normalized_buf[buf_offset] = normalized; + dy_buf[buf_offset] = dy_pack[i]; + sum_stats1 += dy_pack[i]; + sum_stats2 += dy_pack[i] * normalized; + } + } + const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); + const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + const int buf_offset = i * num_packs + pack_id; + pack[i] = (cols * dy_buf[buf_offset] - row_sum_stats1 + - normalized_buf[buf_offset] * row_sum_stats2) + * inv_variance_over_cols; + } + store.template store(pack, 
row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradBlockSMemImpl + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchLayerNormGradBlockSMemImpl_1024(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(LayerNormGradBlockSMemImpl_1024, + block_size, smem, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradBlockSMemImpl_1024 + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchLayerNormGradBlockSMemImplBlockSize( + hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, const ComputeType* inv_variance, const int64_t rows, + const int64_t cols, bool* success) { + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType) * 2; + int max_active_blocks_conf_1; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + LayerNormGradBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + LayerNormGradBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormGradBlockSMemImpl_1024( + stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + LayerNormGradBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormGradBlockSMemImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + LayerNormGradBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { + *success = true; + return LaunchLayerNormGradBlockSMemImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, smem, rows, cols); + } + *success = true; + return LaunchLayerNormGradBlockSMemImpl(stream, load_x, load_scaled_dy, store, + mean, inv_variance, smem, rows, cols); +} + +template 
+struct TryDispatchLayerNormGradBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols, bool* success) { + if (cols % 2 == 0) { + return TryDispatchLayerNormGradBlockSMemImplBlockSize( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); + } else { + return TryDispatchLayerNormGradBlockSMemImplBlockSize( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); + } + } +}; + +template +inline hipError_t TryDispatchLayerNormGradBlockSMemImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols, + bool* success) { + return TryDispatchLayerNormGradBlockSMemImplPackSize()( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, success); +} + +template +__global__ void LayerNormGradBlockUncachedImpl(LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = static_cast(cols) / pack_size; + const ComputeType one_over_cols = static_cast(1.0) / static_cast(cols); + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + const ComputeType mean_val = mean[row]; + const ComputeType inv_variance_val = inv_variance[row]; + const ComputeType inv_variance_over_cols = inv_variance_val * one_over_cols; + ComputeType sum_stats1 = 0; + ComputeType sum_stats2 = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); + +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + sum_stats1 += dy_pack[i]; + sum_stats2 += dy_pack[i] * (x_pack[i] - mean_val) * inv_variance_val; + } + } + const ComputeType row_sum_stats1 = BlockAllReduce(sum_stats1); + const ComputeType row_sum_stats2 = BlockAllReduce(sum_stats2); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType x_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_x.template load(x_pack, row, pack_id * pack_size); + load_scaled_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + dy_pack[i] = (cols * dy_pack[i] - row_sum_stats1 + - (x_pack[i] - mean_val) * inv_variance_val * row_sum_stats2) + * inv_variance_over_cols; + } + store.template store(dy_pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, STORE store, + const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = + GetNumBlocks(LayerNormGradBlockUncachedImpl, + block_size, 0, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + LayerNormGradBlockUncachedImpl + <<>>(load_x, load_scaled_dy, store, mean, inv_variance, + rows, cols); + return hipPeekAtLastError(); +} + +template +struct 
DispatchLayerNormGradBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0 && cols > kWarpSize) { + return LaunchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + return LaunchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } + } +}; + +template +inline hipError_t DispatchLayerNormGradBlockUncachedImpl(hipStream_t stream, LOAD_X load_x, + LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, + const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + return DispatchLayerNormGradBlockUncachedImplPackSize()( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + if (cols <= 1024) { + return DispatchLayerNormGradWarpImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = + TryDispatchLayerNormGradBlockSMemImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols, + &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLayerNormGrad(hipStream_t stream, LOAD_X load_x, LOAD_SCALED_DY load_scaled_dy, + STORE store, const ComputeType* mean, const ComputeType* inv_variance, + const int64_t rows, const int64_t cols) { + return DispatchLayerNormGradBlockUncachedImpl( + stream, load_x, load_scaled_dy, store, mean, inv_variance, rows, cols); +} + +} // namespace layer_norm + +} // namespace cuda + +} // namespace oneflow + + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_LAYER_NORM_H_ diff --git a/oneflow/core/hip/softmax.hip.h b/oneflow/core/hip/softmax.hip.h index 5cf7f05..f887d2e 100644 --- a/oneflow/core/hip/softmax.hip.h +++ b/oneflow/core/hip/softmax.hip.h @@ -1,1499 +1,1499 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#ifndef ONEFLOW_CORE_HIP_SOFTMAX_H_ -#define ONEFLOW_CORE_HIP_SOFTMAX_H_ - -#ifdef WITH_ROCM - -#include -// #include -#include -#include - -// #if CUDA_VERSION >= 11000 -// #include -// #endif // CUDA_VERSION >= 11000 - -namespace oneflow { - -namespace cuda { - -namespace softmax { - -constexpr int kWarpSize = 64; - -template -struct SumOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } -}; - -template -struct MaxOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } -}; - -template class ReductionOp, typename T, int thread_group_width = kWarpSize> -__inline__ __device__ T WarpAllReduce(T val) { - for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { - // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask)); - val = ReductionOp()(val, __shfl_xor(val, mask, kWarpSize)); - } - return val; -} - -template class ReductionOp, typename T, int block_size> -__inline__ __device__ T BlockAllReduce(T val) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T result_broadcast; - T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); - if (threadIdx.x == 0) { result_broadcast = result; } - __syncthreads(); - return result_broadcast; -} - -template -__inline__ __device__ T Inf(); - -template<> -__inline__ __device__ float Inf() { - return __int_as_float(0x7f800000U); -} - -template<> -__inline__ __device__ double Inf() { - return __longlong_as_double(0x7ff0000000000000ULL); -} - -template -__inline__ __device__ T Exp(T x); - -template<> -__inline__ __device__ float Exp(float x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __expf(x); -#else - return exp(x); -#endif -} - -template<> -__inline__ __device__ double Exp(double x) { - return exp(x); -} - -template -__inline__ __device__ T Div(T a, T b); - -template<> -__inline__ __device__ float Div(float a, float b) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __fdividef(a, b); -#else - return a / b; -#endif -} - -template<> -__inline__ __device__ double Div(double a, double b) { - return a / b; -} - -template -__inline__ __device__ T Log(T x); - -template<> -__inline__ __device__ float Log(float x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __logf(x); -#else - return log(x); -#endif -} -template<> -__inline__ __device__ double Log(double x) { - return log(x); -} - -inline hipError_t GetNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, - int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int tpm; - { - hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); - if (err != hipSuccess) { return err; } - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); - return hipSuccess; -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -// #if CUDA_VERSION >= 11000 -// template<> -// struct DefaultComputeType { -// using type = float; -// }; -// #endif // CUDA_VERSION >= 11000 - -template -struct GetPackType { - using type = typename std::aligned_storage::type; -}; - -template -using PackType = typename GetPackType::type; - -template -union Pack { - static_assert(sizeof(PackType) == 
sizeof(T) * N, ""); - __device__ Pack() { - // do nothing - } - PackType storage; - T elem[N]; -}; - -template -struct DirectLoad { - DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { dst[i] = static_cast(pack.elem[i]); } - } - const SRC* src; - int64_t row_size; -}; - -template -struct DirectStore { - DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - Pack pack; - const int64_t offset = (row * row_size + col) / N; -#pragma unroll - for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - int64_t row_size; -}; - -enum class Algorithm { - kSoftmax = 0, - kLogSoftmax = 1, -}; - -template -__global__ void SoftmaxWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - constexpr int num_packs = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - ComputeType buf[rows_per_access][cols_per_thread]; - const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_max[row_id] = -Inf(); - ComputeType* row_buf = buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; ++pack_id) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - load.template load(row_buf + pack_offset, row + row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - thread_max[row_id] = max(thread_max[row_id], row_buf[pack_offset + i]); - } - } else { -#pragma unroll - for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = -Inf(); } - } - } - } - ComputeType warp_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_max[row_id] = WarpAllReduce(thread_max[row_id]); - } - ComputeType thread_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_sum[row_id] = 0; - ComputeType* row_buf = buf[row_id]; -#pragma unroll - for (int i = 0; i < cols_per_thread; ++i) { - if (algorithm == Algorithm::kSoftmax) { - row_buf[i] = Exp(row_buf[i] - warp_max[row_id]); - thread_sum[row_id] += row_buf[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - row_buf[i] -= warp_max[row_id]; - thread_sum[row_id] += Exp(row_buf[i]); - } else { - asm volatile("s_trap 0;"); - } - } - } - ComputeType warp_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); - } -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - ComputeType* row_buf 
= buf[row_id]; -#pragma unroll - for (int i = 0; i < cols_per_thread; ++i) { - if (algorithm == Algorithm::kSoftmax) { - row_buf[i] = Div(row_buf[i], warp_sum[row_id]); - } else if (algorithm == Algorithm::kLogSoftmax) { - row_buf[i] -= Log(warp_sum[row_id]); - } else { - asm volatile("s_trap 0;"); - } - } -#pragma unroll - for (int i = 0; i < num_packs; ++i) { - const int col = (i * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - store.template store(row_buf + i * pack_size, row + row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - // std::cout << "LaunchSoftmaxWarpImpl" << std::endl; - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxWarpImpl - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchSoftmaxWarpImplPadding(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } else { - return LaunchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } -} - -template -typename std::enable_if::type DispatchSoftmaxWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } else { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchSoftmaxWarpImplCols( - hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= 
(thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } else { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, \ - rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchSoftmaxWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - if (cols % 2 == 0) { - return DispatchSoftmaxWarpImplCols(stream, load, - store, rows, cols); - } else { - return DispatchSoftmaxWarpImplCols(stream, load, - store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxWarpImplPackSize()(stream, load, store, - rows, cols); -} - -template -__global__ void SoftmaxBlockSMemImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_max = -Inf(); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - thread_max = max(thread_max, pack[i]); - } - } - const ComputeType row_max = BlockAllReduce(thread_max); - ComputeType thread_sum = 0; - for (int col = tid; col < cols; col += block_size) { - if (algorithm == Algorithm::kSoftmax) { - const ComputeType exp_x = Exp(buf[col] - row_max); - buf[col] = exp_x; - thread_sum += exp_x; - } else { - const ComputeType x = buf[col] - row_max; - buf[col] = x; - thread_sum += Exp(x); - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = Div(buf[i * num_packs + pack_id], row_sum); - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void SoftmaxBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; - auto* buf = reinterpret_cast(shared_buf); - const int tid = threadIdx.x; - assert(cols % pack_size == 
0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_max = -Inf(); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - buf[i * num_packs + pack_id] = pack[i]; - thread_max = max(thread_max, pack[i]); - } - } - const ComputeType row_max = BlockAllReduce(thread_max); - ComputeType thread_sum = 0; - for (int col = tid; col < cols; col += block_size) { - if (algorithm == Algorithm::kSoftmax) { - const ComputeType exp_x = Exp(buf[col] - row_max); - buf[col] = exp_x; - thread_sum += exp_x; - } else { - const ComputeType x = buf[col] - row_max; - buf[col] = x; - thread_sum += Exp(x); - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = Div(buf[i * num_packs + pack_id], row_sum); - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, int smem, - const int64_t rows, const int64_t cols) { - - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxBlockSMemImpl - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchSoftmaxBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, int smem, - const int64_t rows, const int64_t cols) { - - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxBlockSMemImpl_1024 - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchSoftmaxBlockSMemImplBlockSize(hipStream_t stream, LOAD load, - STORE store, const int64_t rows, - const int64_t cols, bool* success) { - - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType); - int max_active_blocks_conf_1; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - SoftmaxBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - SoftmaxBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxBlockSMemImpl_1024(stream, load, store, smem, rows, cols); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - SoftmaxBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - if 
(max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - SoftmaxBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); - } - *success = true; - return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); -} - -template -struct TryDispatchSoftmaxBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols, bool* success) { - if (cols % 2 == 0) { - return TryDispatchSoftmaxBlockSMemImplBlockSize( - stream, load, store, rows, cols, success); - } else { - return TryDispatchSoftmaxBlockSMemImplBlockSize( - stream, load, store, rows, cols, success); - } - } -}; - -template -inline hipError_t TryDispatchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols, - bool* success) { - return TryDispatchSoftmaxBlockSMemImplPackSize()( - stream, load, store, rows, cols, success); -} - -template -__global__ void SoftmaxBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, - const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_max = -Inf(); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { thread_max = max(thread_max, pack[i]); } - } - const ComputeType row_max = BlockAllReduce(thread_max); - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { thread_sum += Exp(pack[i] - row_max); } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; - load.template load(pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = Div(Exp(pack[i] - row_max), row_sum); - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = (pack[i] - row_max) - Log(row_sum); - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - // std::cout << "LaunchSoftmaxBlockUncachedImpl" << std::endl; - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxBlockUncachedImpl - <<>>(load, store, rows, cols); - return hipPeekAtLastError(); -} - -template -struct DispatchSoftmaxBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t 
rows, - const int64_t cols) { - if (cols % 2 == 0) { - return LaunchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } else { - return LaunchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxBlockUncachedImplPackSize()( - stream, load, store, rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - if (cols < 1024) { - return DispatchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = - TryDispatchSoftmaxBlockSMemImpl( - stream, load, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - if (cols <= 1024) { - return DispatchSoftmaxWarpImpl( - stream, load, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = - TryDispatchSoftmaxBlockSMemImpl( - stream, load, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, - const int64_t cols) { - return DispatchSoftmaxBlockUncachedImpl( - stream, load, store, rows, cols); -} - -template -__global__ void SoftmaxGradWarpImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, - const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - constexpr int pack_per_thread = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - ComputeType y_buf[rows_per_access][cols_per_thread]; - ComputeType dy_buf[rows_per_access][cols_per_thread]; - const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - thread_sum[row_id] = 0; - ComputeType* row_y_buf = y_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if 
(!padding || col < cols) { - load_y.template load(row_y_buf + pack_offset, row + row_id, col); - load_dy.template load(row_dy_buf + pack_offset, row + row_id, col); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - thread_sum[row_id] += row_y_buf[pack_offset + i] * row_dy_buf[pack_offset + i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum[row_id] += row_dy_buf[pack_offset + i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - } - } - ComputeType warp_sum[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); - } -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; ++row_id) { - ComputeType* row_y_buf = y_buf[row_id]; - ComputeType* row_dy_buf = dy_buf[row_id]; -#pragma unroll - for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - if (!padding || col < cols) { - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - row_dy_buf[pack_offset + i] = - (row_dy_buf[pack_offset + i] - warp_sum[row_id]) * row_y_buf[pack_offset + i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - row_dy_buf[pack_offset + i] -= Exp(row_y_buf[pack_offset + i]) * warp_sum[row_id]; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(row_dy_buf + pack_offset, row + row_id, col); - } - } - } - } -} - -template -inline hipError_t LaunchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, - STORE store, const int64_t rows, const int64_t cols) { - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradWarpImpl - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchSoftmaxGradWarpImplPadding(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); - } else { - return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); - } -} - -template -typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( - hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } else { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return 
DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ - store, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( - hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } else { \ - return DispatchSoftmaxGradWarpImplPadding( \ - stream, load_y, load_dy, store, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ - store, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchSoftmaxGradWarpImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0) { - return DispatchSoftmaxGradWarpImplCols( - stream, load_y, load_dy, store, rows, cols); - } else { - return DispatchSoftmaxGradWarpImplCols( - stream, load_y, load_dy, store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, - STORE store, const int64_t rows, - const int64_t cols) { - return DispatchSoftmaxGradWarpImplPackSize()( - stream, load_y, load_dy, store, rows, cols); -} - -template -__global__ void SoftmaxGradBlockSMemImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* y_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = y_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, 
row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - y_buf[i * num_packs + pack_id] = y_pack[i]; - dy_buf[i * num_packs + pack_id] = dy_pack[i]; - if (algorithm == Algorithm::kSoftmax) { - thread_sum += y_pack[i] * dy_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum += dy_pack[i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -__global__ void SoftmaxGradBlockSMemImpl_1024(LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; - auto* y_buf = reinterpret_cast(grad_shared_buf); - auto* dy_buf = y_buf + cols; - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - y_buf[i * num_packs + pack_id] = y_pack[i]; - dy_buf[i * num_packs + pack_id] = dy_pack[i]; - if (algorithm == Algorithm::kSoftmax) { - thread_sum += y_pack[i] * dy_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum += dy_pack[i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType pack[pack_size]; -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; - } else if (algorithm == Algorithm::kLogSoftmax) { - pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradBlockSMemImpl - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t LaunchSoftmaxGradBlockSMemImpl_1024(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, int smem, - const int64_t rows, const int64_t cols) { - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, 
&grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradBlockSMemImpl_1024 - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t TryDispatchSoftmaxGradBlockSMemImplBlockSize(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, - const int64_t cols, bool* success) { - constexpr int block_size_conf_1 = 128; - constexpr int block_size_conf_2 = 256; - constexpr int block_size_conf_3 = 512; - constexpr int block_size_conf_4 = 1024; - const size_t smem = cols * sizeof(ComputeType) * 2; - int max_active_blocks_conf_1; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_1, - SoftmaxGradBlockSMemImpl, - block_size_conf_1, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_1 <= 0) { - *success = false; - return hipSuccess; - } - int max_active_blocks_conf_4; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_4, - SoftmaxGradBlockSMemImpl_1024, - block_size_conf_4, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxGradBlockSMemImpl_1024(stream, load_y, load_dy, - store, smem, rows, cols); - } - int max_active_blocks_conf_3; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_3, - SoftmaxGradBlockSMemImpl, - block_size_conf_3, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, - store, smem, rows, cols); - } - int max_active_blocks_conf_2; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks_conf_2, - SoftmaxGradBlockSMemImpl, - block_size_conf_2, smem); - if (err != hipSuccess) { return err; } - } - if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { - *success = true; - return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, - store, smem, rows, cols); - } - *success = true; - return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, - store, smem, rows, cols); -} - -template -struct TryDispatchSoftmaxGradBlockSMemImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols, bool* success) { - if (cols % 2 == 0) { - return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, - rows, cols, success); - } else { - return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, - rows, cols, success); - } - } -}; - -template -inline hipError_t TryDispatchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols, - bool* success) { - return TryDispatchSoftmaxGradBlockSMemImplPackSize()(stream, load_y, load_dy, store, - rows, cols, success); -} - -template -__global__ void SoftmaxGradBlockUncachedImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { - const int tid = threadIdx.x; - assert(cols % pack_size == 0); - const int num_packs = cols / pack_size; - for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { - ComputeType thread_sum = 0; - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; 
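The softmax-gradient kernels on both sides of this hunk (the CUDA-era versions being removed here and the HIP replacements added later in the patch) reduce one scalar per row and then rewrite dy in place: for Algorithm::kSoftmax the reduction is s = sum_j y_j * dy_j and the output is (dy_j - s) * y_j; for Algorithm::kLogSoftmax the reduction is s = sum_j dy_j and the output is dy_j - exp(y_j) * s. A minimal single-threaded C++ sketch of that row computation, intended only as a host-side cross-check (the function name, the local Algorithm enum, and the std::vector interface are illustrative stand-ins, not part of the header):

#include <cmath>
#include <cstddef>
#include <vector>

enum class Algorithm { kSoftmax, kLogSoftmax };

// Reference backward pass for one row: y is the forward output of the chosen
// algorithm, dy is the incoming gradient, dx receives the result.
void SoftmaxGradRowReference(Algorithm algorithm, const std::vector<float>& y,
                             const std::vector<float>& dy, std::vector<float>* dx) {
  const std::size_t cols = y.size();
  dx->resize(cols);
  double s = 0.0;  // plays the role of warp_sum / row_sum in the kernels
  for (std::size_t j = 0; j < cols; ++j) {
    s += (algorithm == Algorithm::kSoftmax) ? static_cast<double>(y[j]) * dy[j] : dy[j];
  }
  for (std::size_t j = 0; j < cols; ++j) {
    if (algorithm == Algorithm::kSoftmax) {
      (*dx)[j] = (dy[j] - static_cast<float>(s)) * y[j];
    } else {
      // For log-softmax, y holds log-probabilities, so exp(y) recovers the probability.
      (*dx)[j] = dy[j] - std::exp(y[j]) * static_cast<float>(s);
    }
  }
}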
- ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, row, pack_id * pack_size); - -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - thread_sum += y_pack[i] * dy_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - thread_sum += dy_pack[i]; - } else { - asm volatile("s_trap 0;"); - } - } - } - const ComputeType row_sum = BlockAllReduce(thread_sum); - for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { - ComputeType y_pack[pack_size]; - ComputeType dy_pack[pack_size]; - load_y.template load(y_pack, row, pack_id * pack_size); - load_dy.template load(dy_pack, row, pack_id * pack_size); -#pragma unroll - for (int i = 0; i < pack_size; ++i) { - if (algorithm == Algorithm::kSoftmax) { - dy_pack[i] = (dy_pack[i] - row_sum) * y_pack[i]; - } else if (algorithm == Algorithm::kLogSoftmax) { - dy_pack[i] -= Exp(y_pack[i]) * row_sum; - } else { - asm volatile("s_trap 0;"); - } - } - store.template store(dy_pack, row, pack_id * pack_size); - } - } -} - -template -inline hipError_t LaunchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - constexpr int block_size = 1024; - constexpr int waves = 32; - int grid_dim_x; - { - hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - } - SoftmaxGradBlockUncachedImpl - <<>>(load_y, load_dy, store, rows, cols); - return hipPeekAtLastError(); -} - -template -struct DispatchSoftmaxGradBlockUncachedImplPackSize { - hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0 && cols > kWarpSize) { - return LaunchSoftmaxGradBlockUncachedImpl( - stream, load_y, load_dy, store, rows, cols); - } else { - return LaunchSoftmaxGradBlockUncachedImpl( - stream, load_y, load_dy, store, rows, cols); - } - } -}; - -template -inline hipError_t DispatchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, - LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxGradBlockUncachedImplPackSize()(stream, load_y, load_dy, store, - rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols <= 1024) { - return DispatchSoftmaxGradWarpImpl( - stream, load_y, load_dy, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( - stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, - store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, store, - rows, cols); -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - if (cols <= 1024) { - return 
DispatchSoftmaxGradWarpImpl( - stream, load_y, load_dy, store, rows, cols); - } else { - bool dispatch_smem_impl_success; - { - hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( - stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); - if (err != hipSuccess) { return err; } - } - if (!dispatch_smem_impl_success) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, - store, rows, cols); - } - return hipSuccess; - } -} - -template -inline typename std::enable_if::value, hipError_t>::type -DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, - const int64_t rows, const int64_t cols) { - return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, - store, rows, cols); -} - -} // namespace softmax - -} // namespace cuda - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_SOFTMAX_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef ONEFLOW_CORE_HIP_SOFTMAX_H_ +#define ONEFLOW_CORE_HIP_SOFTMAX_H_ + +#ifdef WITH_ROCM + +#include +// #include +#include +#include + +// #if CUDA_VERSION >= 11000 +// #include +// #endif // CUDA_VERSION >= 11000 + +namespace oneflow { + +namespace cuda { + +namespace softmax { + +constexpr int kWarpSize = 64; + +template +struct SumOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct MaxOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return max(a, b); } +}; + +template class ReductionOp, typename T, int thread_group_width = kWarpSize> +__inline__ __device__ T WarpAllReduce(T val) { + for (int mask = thread_group_width / 2; mask > 0; mask /= 2) { + // val = ReductionOp()(val, __shfl_xor(0xffffffff, val, mask)); + val = ReductionOp()(val, __shfl_xor(val, mask, kWarpSize)); + } + return val; +} + +template class ReductionOp, typename T, int block_size> +__inline__ __device__ T BlockAllReduce(T val) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T result_broadcast; + T result = BlockReduce(temp_storage).Reduce(val, ReductionOp()); + if (threadIdx.x == 0) { result_broadcast = result; } + __syncthreads(); + return result_broadcast; +} + +template +__inline__ __device__ T Inf(); + +template<> +__inline__ __device__ float Inf() { + return __int_as_float(0x7f800000U); +} + +template<> +__inline__ __device__ double Inf() { + return __longlong_as_double(0x7ff0000000000000ULL); +} + +template +__inline__ __device__ T Exp(T x); + +template<> +__inline__ __device__ float Exp(float x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __expf(x); +#else + return exp(x); +#endif +} + +template<> +__inline__ __device__ double Exp(double x) { + return exp(x); +} + +template +__inline__ __device__ T Div(T a, T b); + +template<> +__inline__ __device__ float Div(float a, float b) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __fdividef(a, b); +#else + return a / b; 
+#endif +} + +template<> +__inline__ __device__ double Div(double a, double b) { + return a / b; +} + +template +__inline__ __device__ T Log(T x); + +template<> +__inline__ __device__ float Log(float x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __logf(x); +#else + return log(x); +#endif +} +template<> +__inline__ __device__ double Log(double x) { + return log(x); +} + +inline hipError_t GetNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, + int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int tpm; + { + hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); + if (err != hipSuccess) { return err; } + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); + return hipSuccess; +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +// #if CUDA_VERSION >= 11000 +// template<> +// struct DefaultComputeType { +// using type = float; +// }; +// #endif // CUDA_VERSION >= 11000 + +template +struct GetPackType { + using type = typename std::aligned_storage::type; +}; + +template +using PackType = typename GetPackType::type; + +template +union Pack { + static_assert(sizeof(PackType) == sizeof(T) * N, ""); + __device__ Pack() { + // do nothing + } + PackType storage; + T elem[N]; +}; + +template +struct DirectLoad { + DirectLoad(const SRC* src, int64_t row_size) : src(src), row_size(row_size) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { dst[i] = static_cast(pack.elem[i]); } + } + const SRC* src; + int64_t row_size; +}; + +template +struct DirectStore { + DirectStore(DST* dst, int64_t row_size) : dst(dst), row_size(row_size) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + Pack pack; + const int64_t offset = (row * row_size + col) / N; +#pragma unroll + for (int i = 0; i < N; ++i) { pack.elem[i] = static_cast(src[i]); } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + int64_t row_size; +}; + +enum class Algorithm { + kSoftmax = 0, + kLogSoftmax = 1, +}; + +template +__global__ void SoftmaxWarpImpl(LOAD load, STORE store, const int64_t rows, const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + constexpr int num_packs = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + ComputeType buf[rows_per_access][cols_per_thread]; + const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_max[row_id] = -Inf(); + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int pack_id = 0; 
pack_id < num_packs; ++pack_id) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + load.template load(row_buf + pack_offset, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + thread_max[row_id] = max(thread_max[row_id], row_buf[pack_offset + i]); + } + } else { +#pragma unroll + for (int i = 0; i < pack_size; ++i) { row_buf[pack_offset + i] = -Inf(); } + } + } + } + ComputeType warp_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_max[row_id] = WarpAllReduce(thread_max[row_id]); + } + ComputeType thread_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_sum[row_id] = 0; + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int i = 0; i < cols_per_thread; ++i) { + if (algorithm == Algorithm::kSoftmax) { + row_buf[i] = Exp(row_buf[i] - warp_max[row_id]); + thread_sum[row_id] += row_buf[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + row_buf[i] -= warp_max[row_id]; + thread_sum[row_id] += Exp(row_buf[i]); + } else { + asm volatile("s_trap 0;"); + } + } + } + ComputeType warp_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); + } +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + ComputeType* row_buf = buf[row_id]; +#pragma unroll + for (int i = 0; i < cols_per_thread; ++i) { + if (algorithm == Algorithm::kSoftmax) { + row_buf[i] = Div(row_buf[i], warp_sum[row_id]); + } else if (algorithm == Algorithm::kLogSoftmax) { + row_buf[i] -= Log(warp_sum[row_id]); + } else { + asm volatile("s_trap 0;"); + } + } +#pragma unroll + for (int i = 0; i < num_packs; ++i) { + const int col = (i * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + store.template store(row_buf + i * pack_size, row + row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + // std::cout << "LaunchSoftmaxWarpImpl" << std::endl; + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxWarpImpl + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchSoftmaxWarpImplPadding(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } else { + return LaunchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } +} + +template +typename std::enable_if::type DispatchSoftmaxWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + 
return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } else { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchSoftmaxWarpImplCols( + hipStream_t stream, LOAD load, STORE store, const int64_t rows, const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } else { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, \ + rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxWarpImplPadding(stream, load, store, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchSoftmaxWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols % 2 == 0) { + return DispatchSoftmaxWarpImplCols(stream, load, + store, rows, cols); + } else { + return DispatchSoftmaxWarpImplCols(stream, load, + store, rows, cols); + } + } +}; + +template +inline hipError_t DispatchSoftmaxWarpImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxWarpImplPackSize()(stream, load, store, + rows, cols); +} + +template +__global__ void SoftmaxBlockSMemImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_max = -Inf(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + 
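SoftmaxBlockSMemImpl and its _1024 variant stage one row in dynamic shared memory and then apply the standard numerically stable recipe: reduce the row maximum, exponentiate the shifted values, reduce their sum, and finish with a divide (softmax) or a log-subtraction (log-softmax). A single-threaded C++ reference of the same row computation, offered only as a sketch for validating kernel output (the function name and the local Algorithm enum are illustrative stand-ins; a non-empty row is assumed):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

enum class Algorithm { kSoftmax, kLogSoftmax };

// Numerically stable row softmax / log-softmax, mirroring the two block-wide
// reductions (row max, then row sum of exponentials) done by the GPU kernels.
std::vector<float> SoftmaxRowReference(Algorithm algorithm, const std::vector<float>& x) {
  const float row_max = *std::max_element(x.begin(), x.end());
  double row_sum = 0.0;
  for (float v : x) { row_sum += std::exp(static_cast<double>(v) - row_max); }
  std::vector<float> out(x.size());
  for (std::size_t j = 0; j < x.size(); ++j) {
    const double shifted = static_cast<double>(x[j]) - row_max;
    out[j] = (algorithm == Algorithm::kSoftmax)
                 ? static_cast<float>(std::exp(shifted) / row_sum)
                 : static_cast<float>(shifted - std::log(row_sum));
  }
  return out;
}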
ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + thread_max = max(thread_max, pack[i]); + } + } + const ComputeType row_max = BlockAllReduce(thread_max); + ComputeType thread_sum = 0; + for (int col = tid; col < cols; col += block_size) { + if (algorithm == Algorithm::kSoftmax) { + const ComputeType exp_x = Exp(buf[col] - row_max); + buf[col] = exp_x; + thread_sum += exp_x; + } else { + const ComputeType x = buf[col] - row_max; + buf[col] = x; + thread_sum += Exp(x); + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = Div(buf[i * num_packs + pack_id], row_sum); + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void SoftmaxBlockSMemImpl_1024(LOAD load, STORE store, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char shared_buf[]; + auto* buf = reinterpret_cast(shared_buf); + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_max = -Inf(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + buf[i * num_packs + pack_id] = pack[i]; + thread_max = max(thread_max, pack[i]); + } + } + const ComputeType row_max = BlockAllReduce(thread_max); + ComputeType thread_sum = 0; + for (int col = tid; col < cols; col += block_size) { + if (algorithm == Algorithm::kSoftmax) { + const ComputeType exp_x = Exp(buf[col] - row_max); + buf[col] = exp_x; + thread_sum += exp_x; + } else { + const ComputeType x = buf[col] - row_max; + buf[col] = x; + thread_sum += Exp(x); + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = Div(buf[i * num_packs + pack_id], row_sum); + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = buf[i * num_packs + pack_id] - Log(row_sum); + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, int smem, + const int64_t rows, const int64_t cols) { + + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxBlockSMemImpl + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchSoftmaxBlockSMemImpl_1024(hipStream_t stream, LOAD load, STORE store, int smem, + const int64_t rows, const int64_t cols) { + + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, 
rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxBlockSMemImpl_1024 + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchSoftmaxBlockSMemImplBlockSize(hipStream_t stream, LOAD load, + STORE store, const int64_t rows, + const int64_t cols, bool* success) { + + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType); + int max_active_blocks_conf_1; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + SoftmaxBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + SoftmaxBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxBlockSMemImpl_1024(stream, load, store, smem, rows, cols); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + SoftmaxBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + SoftmaxBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); + } + *success = true; + return LaunchSoftmaxBlockSMemImpl(stream, load, store, smem, rows, cols); +} + +template +struct TryDispatchSoftmaxBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols, bool* success) { + if (cols % 2 == 0) { + return TryDispatchSoftmaxBlockSMemImplBlockSize( + stream, load, store, rows, cols, success); + } else { + return TryDispatchSoftmaxBlockSMemImplBlockSize( + stream, load, store, rows, cols, success); + } + } +}; + +template +inline hipError_t TryDispatchSoftmaxBlockSMemImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols, + bool* success) { + return TryDispatchSoftmaxBlockSMemImplPackSize()( + stream, load, store, rows, cols, success); +} + +template +__global__ void SoftmaxBlockUncachedImpl(LOAD load, STORE store, const int64_t rows, + const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_max = -Inf(); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { thread_max = max(thread_max, pack[i]); } + } + const ComputeType row_max = BlockAllReduce(thread_max); + ComputeType 
thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { thread_sum += Exp(pack[i] - row_max); } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; + load.template load(pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = Div(Exp(pack[i] - row_max), row_sum); + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = (pack[i] - row_max) - Log(row_sum); + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + // std::cout << "LaunchSoftmaxBlockUncachedImpl" << std::endl; + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxBlockUncachedImpl + <<>>(load, store, rows, cols); + return hipPeekAtLastError(); +} + +template +struct DispatchSoftmaxBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols % 2 == 0) { + return LaunchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } else { + return LaunchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } + } +}; + +template +inline hipError_t DispatchSoftmaxBlockUncachedImpl(hipStream_t stream, LOAD load, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxBlockUncachedImplPackSize()( + stream, load, store, rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols < 1024) { + return DispatchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = + TryDispatchSoftmaxBlockSMemImpl( + stream, load, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + if (cols <= 1024) { + return DispatchSoftmaxWarpImpl( + stream, load, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = + TryDispatchSoftmaxBlockSMemImpl( + stream, load, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, 
hipError_t>::type +DispatchLogSoftmax(hipStream_t stream, LOAD load, STORE store, const int64_t rows, + const int64_t cols) { + return DispatchSoftmaxBlockUncachedImpl( + stream, load, store, rows, cols); +} + +template +__global__ void SoftmaxGradWarpImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, + const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + constexpr int pack_per_thread = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + ComputeType y_buf[rows_per_access][cols_per_thread]; + ComputeType dy_buf[rows_per_access][cols_per_thread]; + const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + thread_sum[row_id] = 0; + ComputeType* row_y_buf = y_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + load_y.template load(row_y_buf + pack_offset, row + row_id, col); + load_dy.template load(row_dy_buf + pack_offset, row + row_id, col); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + thread_sum[row_id] += row_y_buf[pack_offset + i] * row_dy_buf[pack_offset + i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum[row_id] += row_dy_buf[pack_offset + i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + } + } + ComputeType warp_sum[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + warp_sum[row_id] = WarpAllReduce(thread_sum[row_id]); + } +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; ++row_id) { + ComputeType* row_y_buf = y_buf[row_id]; + ComputeType* row_dy_buf = dy_buf[row_id]; +#pragma unroll + for (int pack_id = 0; pack_id < pack_per_thread; ++pack_id) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + if (!padding || col < cols) { + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + row_dy_buf[pack_offset + i] = + (row_dy_buf[pack_offset + i] - warp_sum[row_id]) * row_y_buf[pack_offset + i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + row_dy_buf[pack_offset + i] -= Exp(row_y_buf[pack_offset + i]) * warp_sum[row_id]; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(row_dy_buf + pack_offset, row + row_id, col); + } + } + } + } +} + +template +inline hipError_t LaunchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, + STORE store, const int64_t rows, const int64_t cols) { + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + 
thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradWarpImpl + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchSoftmaxGradWarpImplPadding(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); + } else { + return LaunchSoftmaxGradWarpImpl(stream, load_y, load_dy, store, rows, cols); + } +} + +template +typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( + hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } else { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ + store, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchSoftmaxGradWarpImplCols( + hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } else { \ + return DispatchSoftmaxGradWarpImplPadding( \ + stream, load_y, load_dy, store, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchSoftmaxGradWarpImplPadding(stream, load_y, load_dy, \ + store, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + 
DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchSoftmaxGradWarpImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0) { + return DispatchSoftmaxGradWarpImplCols( + stream, load_y, load_dy, store, rows, cols); + } else { + return DispatchSoftmaxGradWarpImplCols( + stream, load_y, load_dy, store, rows, cols); + } + } +}; + +template +inline hipError_t DispatchSoftmaxGradWarpImpl(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, + STORE store, const int64_t rows, + const int64_t cols) { + return DispatchSoftmaxGradWarpImplPackSize()( + stream, load_y, load_dy, store, rows, cols); +} + +template +__global__ void SoftmaxGradBlockSMemImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* y_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = y_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + y_buf[i * num_packs + pack_id] = y_pack[i]; + dy_buf[i * num_packs + pack_id] = dy_pack[i]; + if (algorithm == Algorithm::kSoftmax) { + thread_sum += y_pack[i] * dy_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum += dy_pack[i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +__global__ void SoftmaxGradBlockSMemImpl_1024(LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + extern __shared__ __align__(sizeof(double)) unsigned char grad_shared_buf[]; + auto* y_buf = reinterpret_cast(grad_shared_buf); + auto* dy_buf = y_buf + cols; + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + y_buf[i * num_packs + pack_id] = y_pack[i]; + dy_buf[i * num_packs + pack_id] = dy_pack[i]; + if (algorithm == Algorithm::kSoftmax) { + 
thread_sum += y_pack[i] * dy_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum += dy_pack[i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType pack[pack_size]; +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + pack[i] = (dy_buf[i * num_packs + pack_id] - row_sum) * y_buf[i * num_packs + pack_id]; + } else if (algorithm == Algorithm::kLogSoftmax) { + pack[i] = dy_buf[i * num_packs + pack_id] - Exp(y_buf[i * num_packs + pack_id]) * row_sum; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradBlockSMemImpl + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t LaunchSoftmaxGradBlockSMemImpl_1024(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, int smem, + const int64_t rows, const int64_t cols) { + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradBlockSMemImpl_1024 + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t TryDispatchSoftmaxGradBlockSMemImplBlockSize(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, + const int64_t cols, bool* success) { + constexpr int block_size_conf_1 = 128; + constexpr int block_size_conf_2 = 256; + constexpr int block_size_conf_3 = 512; + constexpr int block_size_conf_4 = 1024; + const size_t smem = cols * sizeof(ComputeType) * 2; + int max_active_blocks_conf_1; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_1, + SoftmaxGradBlockSMemImpl, + block_size_conf_1, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_1 <= 0) { + *success = false; + return hipSuccess; + } + int max_active_blocks_conf_4; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_4, + SoftmaxGradBlockSMemImpl_1024, + block_size_conf_4, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_4 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxGradBlockSMemImpl_1024(stream, load_y, load_dy, + store, smem, rows, cols); + } + int max_active_blocks_conf_3; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_3, + SoftmaxGradBlockSMemImpl, + block_size_conf_3, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_3 == max_active_blocks_conf_1) { + *success = true; + return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, + store, smem, rows, cols); + } + int max_active_blocks_conf_2; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_conf_2, + SoftmaxGradBlockSMemImpl, + block_size_conf_2, smem); + if (err != hipSuccess) { return err; } + } + if (max_active_blocks_conf_2 == max_active_blocks_conf_1) 
{ + *success = true; + return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, + store, smem, rows, cols); + } + *success = true; + return LaunchSoftmaxGradBlockSMemImpl(stream, load_y, load_dy, + store, smem, rows, cols); +} + +template +struct TryDispatchSoftmaxGradBlockSMemImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols, bool* success) { + if (cols % 2 == 0) { + return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, + rows, cols, success); + } else { + return TryDispatchSoftmaxGradBlockSMemImplBlockSize(stream, load_y, load_dy, store, + rows, cols, success); + } + } +}; + +template +inline hipError_t TryDispatchSoftmaxGradBlockSMemImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols, + bool* success) { + return TryDispatchSoftmaxGradBlockSMemImplPackSize()(stream, load_y, load_dy, store, + rows, cols, success); +} + +template +__global__ void SoftmaxGradBlockUncachedImpl(LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) __attribute__((amdgpu_flat_work_group_size(1,1024))) { + const int tid = threadIdx.x; + assert(cols % pack_size == 0); + const int num_packs = cols / pack_size; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + ComputeType thread_sum = 0; + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); + +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + thread_sum += y_pack[i] * dy_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + thread_sum += dy_pack[i]; + } else { + asm volatile("s_trap 0;"); + } + } + } + const ComputeType row_sum = BlockAllReduce(thread_sum); + for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) { + ComputeType y_pack[pack_size]; + ComputeType dy_pack[pack_size]; + load_y.template load(y_pack, row, pack_id * pack_size); + load_dy.template load(dy_pack, row, pack_id * pack_size); +#pragma unroll + for (int i = 0; i < pack_size; ++i) { + if (algorithm == Algorithm::kSoftmax) { + dy_pack[i] = (dy_pack[i] - row_sum) * y_pack[i]; + } else if (algorithm == Algorithm::kLogSoftmax) { + dy_pack[i] -= Exp(y_pack[i]) * row_sum; + } else { + asm volatile("s_trap 0;"); + } + } + store.template store(dy_pack, row, pack_id * pack_size); + } + } +} + +template +inline hipError_t LaunchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + constexpr int block_size = 1024; + constexpr int waves = 32; + int grid_dim_x; + { + hipError_t err = GetNumBlocks(block_size, rows, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + } + SoftmaxGradBlockUncachedImpl + <<>>(load_y, load_dy, store, rows, cols); + return hipPeekAtLastError(); +} + +template +struct DispatchSoftmaxGradBlockUncachedImplPackSize { + hipError_t operator()(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0 && cols > kWarpSize) { + return LaunchSoftmaxGradBlockUncachedImpl( + stream, load_y, load_dy, store, rows, cols); + } else { + return LaunchSoftmaxGradBlockUncachedImpl( + stream, load_y, load_dy, store, rows, cols); + 
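TryDispatchSoftmaxGradBlockSMemImplBlockSize above chooses a block size purely by occupancy: it asks hipOccupancyMaxActiveBlocksPerMultiprocessor how many 128-thread blocks fit per multiprocessor with the required dynamic shared memory, gives up on the shared-memory path if the answer is zero, and otherwise takes the largest of 1024/512/256 whose resident-block count matches the 128-thread baseline. A host-side sketch of just that decision, with the occupancy query abstracted behind a callback so the logic can be exercised without a GPU (the helper name and the callback are stand-ins, not HIP API):

#include <functional>
#include <initializer_list>

// occupancy(block_size) should return the maximum resident blocks per multiprocessor
// for the kernel variant launched with `block_size` threads and the required dynamic
// shared memory, i.e. what hipOccupancyMaxActiveBlocksPerMultiprocessor reports.
// Returns the chosen block size, or 0 when the shared-memory variant should be skipped.
int PickBlockSizeByOccupancy(const std::function<int(int)>& occupancy) {
  const int baseline = occupancy(128);
  if (baseline <= 0) { return 0; }  // even 128 threads cannot be scheduled: caller falls back to the uncached impl
  for (int candidate : {1024, 512, 256}) {
    // Prefer the largest block size that keeps the per-SM block count equal to the baseline.
    if (occupancy(candidate) == baseline) { return candidate; }
  }
  return 128;
}

Equal resident-block counts at a larger block size mean more threads in flight for the same shared-memory footprint, which is why the candidates are tried from 1024 downward.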
} + } +}; + +template +inline hipError_t DispatchSoftmaxGradBlockUncachedImpl(hipStream_t stream, LOAD_Y load_y, + LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxGradBlockUncachedImplPackSize()(stream, load_y, load_dy, store, + rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols <= 1024) { + return DispatchSoftmaxGradWarpImpl( + stream, load_y, load_dy, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( + stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, + store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, store, + rows, cols); +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + if (cols <= 1024) { + return DispatchSoftmaxGradWarpImpl( + stream, load_y, load_dy, store, rows, cols); + } else { + bool dispatch_smem_impl_success; + { + hipError_t err = TryDispatchSoftmaxGradBlockSMemImpl( + stream, load_y, load_dy, store, rows, cols, &dispatch_smem_impl_success); + if (err != hipSuccess) { return err; } + } + if (!dispatch_smem_impl_success) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, + store, rows, cols); + } + return hipSuccess; + } +} + +template +inline typename std::enable_if::value, hipError_t>::type +DispatchLogSoftmaxGrad(hipStream_t stream, LOAD_Y load_y, LOAD_DY load_dy, STORE store, + const int64_t rows, const int64_t cols) { + return DispatchSoftmaxGradBlockUncachedImpl(stream, load_y, load_dy, + store, rows, cols); +} + +} // namespace softmax + +} // namespace cuda + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_SOFTMAX_H_ diff --git a/oneflow/core/hip/unique.hip.h b/oneflow/core/hip/unique.hip.h index 6e7e671..4de2c65 100644 --- a/oneflow/core/hip/unique.hip.h +++ b/oneflow/core/hip/unique.hip.h @@ -1,251 +1,251 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
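Note on the backward dispatcher that closes softmax.hip.h above: it keeps the heuristic of the CUDA original, so rows with cols <= 1024 take the per-warp kernel, while larger rows first try the shared-memory kernel and fall back to the uncached kernel when the shared-memory variant reports that it cannot be launched. A minimal host-side sketch of invoking it follows, assuming the DirectLoad/DirectStore/DefaultComputeType helpers of the upstream CUDA header are also defined earlier in this HIP port; the wrapper name and float buffers are illustrative only, not taken from this patch.

// Hedged sketch, not part of the patch: assumes DirectLoad/DirectStore/DefaultComputeType
// are defined earlier in oneflow/core/hip/softmax.hip.h, as in the CUDA original.
#include <hip/hip_runtime.h>
#include "oneflow/core/hip/softmax.hip.h"

hipError_t SoftmaxGradExample(hipStream_t stream, const float* y, const float* dy, float* dx,
                              int64_t rows, int64_t cols) {
  using namespace oneflow::cuda::softmax;
  using ComputeType = typename DefaultComputeType<float>::type;  // float inputs compute in float
  DirectLoad<float, ComputeType> load_y(y, cols);     // reads softmax output y
  DirectLoad<float, ComputeType> load_dy(dy, cols);   // reads upstream gradient dy
  DirectStore<ComputeType, float> store(dx, cols);    // writes input gradient dx
  // cols <= 1024 -> warp kernel; otherwise shared-memory kernel with an uncached fallback.
  return DispatchSoftmaxGrad<decltype(load_y), decltype(load_dy), decltype(store), ComputeType>(
      stream, load_y, load_dy, store, rows, cols);
}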
-*/ -#ifndef ONEFLOW_CORE_HIP_UNIQUE_H_ -#define ONEFLOW_CORE_HIP_UNIQUE_H_ - -#ifdef WITH_ROCM - -#include -#include "hip/hip_runtime.h" -// #include -#include "hip/hip_runtime.h" -#include "oneflow/core/common/permutation_iterator.h" -#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" - -namespace oneflow { - -namespace cuda { - -namespace unique { - -using Flag = uint32_t; -static constexpr Flag kDefault = 0x0; -static constexpr Flag kInputSorted = 0x1; -static constexpr Flag kOutputInverseIndices = 0x1 << 1; -static constexpr Flag kOutputCounts = 0x1 << 2; - -namespace { - -constexpr size_t kCudaAlignSize = 512; - -__device__ __host__ __forceinline__ size_t GetCudaAlignedSize(size_t size) { - return (size + kCudaAlignSize - 1) / kCudaAlignSize * kCudaAlignSize; -} - -template -__device__ __host__ __forceinline__ T* PtrOffset(void* ptr, size_t offset) { - return reinterpret_cast(reinterpret_cast(ptr) + offset); -} - -__device__ __host__ __forceinline__ size_t max(size_t a, size_t b) { return a > b ? a : b; } - -template -hipError_t DoUnique(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, - void* workspace, size_t* workspace_size, hipStream_t stream) { - size_t ws = *workspace_size; - hipError_t err = hipcub::DeviceSelect::Unique( - workspace, ws, sorted_in, unique, num_unique, n, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DoUniqueWithCounts(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, - Index* counts, void* workspace, size_t* workspace_size, - hipStream_t stream) { - size_t ws = *workspace_size; - hipError_t err = hipcub::DeviceRunLengthEncode::Encode( - workspace, ws, sorted_in, unique, counts, num_unique, n, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DispatchOutputCounts(Flag flag, size_t n, const Key* sorted_in, Key* unique, - Index* num_unique, Index* counts, void* workspace, - size_t* workspace_size, hipStream_t stream) { - size_t ws = *workspace_size; - if ((flag & kOutputCounts) != 0) { - hipError_t err = DoUniqueWithCounts(n, sorted_in, unique, num_unique, counts, - workspace, &ws, stream); - if (err != hipSuccess) { return err; } - } else { - hipError_t err = - DoUnique(n, sorted_in, unique, num_unique, workspace, &ws, stream); - if (err != hipSuccess) { return err; } - } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DoGenInverseIndices(size_t n, const Key* sorted_in, - InverseIndicesIter inverse_indices_iter, void* workspace, - size_t* workspace_size, hipStream_t stream) { - size_t ws = *workspace_size; - NotEqualToPreviousAdjacentIterator unique_counting_iter(sorted_in, 0); - hipError_t err = - hipcub::DeviceScan::InclusiveSum( - workspace, ws, unique_counting_iter, inverse_indices_iter, n, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = ws; } - return hipSuccess; -} - -template -hipError_t DispatchOutputInverseIndices(Flag flag, size_t n, const Key* sorted_in, Key* unique, - Index* num_unique, InverseIndicesIter inverse_indices_iter, - Index* counts, void* workspace, size_t* workspace_size, - hipStream_t stream) { - size_t dispatch_with_counts_ws = *workspace_size; - size_t do_gen_inverse_indices_ws = *workspace_size; - { - hipError_t err = - DispatchOutputCounts(flag, n, sorted_in, unique, 
num_unique, counts, workspace, - &dispatch_with_counts_ws, stream); - if (err != hipSuccess) { return err; } - } - if ((flag & kOutputInverseIndices) != 0) { - hipError_t err = DoGenInverseIndices( - n, sorted_in, inverse_indices_iter, workspace, &do_gen_inverse_indices_ws, stream); - if (err != hipSuccess) { return err; } - } - if (*workspace_size == 0) { - *workspace_size = max(dispatch_with_counts_ws, do_gen_inverse_indices_ws); - } - return hipSuccess; -} - -template -__global__ void IotaKernel(size_t n, T* out) { - for (T i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < n; - i += step) { - out[i] = i; - } -} - -template -hipError_t DoSort(size_t n, const Key* in, Key* sorted, Index* sorted_indices, void* workspace, - size_t* workspace_size, hipStream_t stream) { - Index* indices; - const size_t indices_size = GetCudaAlignedSize(n * sizeof(Index)); - void* sort_workspace; - size_t sort_ws; - if (*workspace_size == 0) { - indices = nullptr; - sort_workspace = nullptr; - sort_ws = 0; - } else { - if (*workspace_size <= indices_size) { return hipErrorInvalidValue; } - indices = PtrOffset(workspace, 0); - sort_workspace = PtrOffset(workspace, indices_size); - sort_ws = *workspace_size - indices_size; - } - if (*workspace_size != 0) { - const int block_size = 1024; - const int num_blocks = static_cast((n + block_size - 1) / block_size); - IotaKernel<<>>(n, indices); - } - hipError_t err = hipcub::DeviceRadixSort::SortPairs( - sort_workspace, sort_ws, in, sorted, indices, sorted_indices, n, 0, sizeof(Key) * 8, stream); - if (err != hipSuccess) { return err; } - if (*workspace_size == 0) { *workspace_size = indices_size + sort_ws; } - return hipSuccess; -} - -template -hipError_t DispatchInputSorted(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, - Index* inverse_indices, Index* counts, void* workspace, - size_t* workspace_size, hipStream_t stream) { - if ((flag & kInputSorted) != 0) { - return DispatchOutputInverseIndices(flag, n, in, unique, num_unique, - inverse_indices, counts, workspace, - workspace_size, stream); - } else { - const size_t sorted_in_size = GetCudaAlignedSize(n * sizeof(Key)); - const size_t sorted_indices_size = GetCudaAlignedSize(n * sizeof(Index)); - const size_t sort_buffer_size = sorted_in_size + sorted_indices_size; - Key* sorted_in; - Index* sorted_indices; - size_t do_sort_ws; - void* do_sort_workspace; - size_t do_inverse_indices_ws; - void* do_inverse_indices_workspace; - if (*workspace_size == 0) { - sorted_in = nullptr; - sorted_indices = nullptr; - do_sort_ws = 0; - do_sort_workspace = nullptr; - do_inverse_indices_ws = 0; - do_inverse_indices_workspace = nullptr; - } else { - if (*workspace_size <= sort_buffer_size) { return hipErrorInvalidValue; } - sorted_in = PtrOffset(workspace, 0); - sorted_indices = PtrOffset(workspace, sorted_in_size); - do_sort_ws = *workspace_size - sort_buffer_size; - do_sort_workspace = PtrOffset(workspace, sort_buffer_size); - do_inverse_indices_ws = do_sort_ws; - do_inverse_indices_workspace = do_sort_workspace; - } - { - hipError_t err = DoSort(n, in, sorted_in, sorted_indices, do_sort_workspace, - &do_sort_ws, stream); - if (err != hipSuccess) { return err; } - } - PermutationIterator inverse_indices_iter(inverse_indices, - sorted_indices); - { - hipError_t err = DispatchOutputInverseIndices( - flag, n, sorted_in, unique, num_unique, inverse_indices_iter, counts, - do_inverse_indices_workspace, &do_inverse_indices_ws, stream); - if (err != hipSuccess) { return err; } - } - if 
(*workspace_size == 0) { - *workspace_size = sort_buffer_size + max(do_sort_ws, do_inverse_indices_ws); - } - return hipSuccess; - } -} - -} // namespace - -template -hipError_t Launch(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, - Index* inverse_indices, Index* counts, void* workspace, size_t workspace_size, - hipStream_t stream) { - if (workspace_size == 0) { return hipErrorInvalidValue; } - return DispatchInputSorted(flag, n, in, unique, num_unique, inverse_indices, counts, - workspace, &workspace_size, stream); -} - -template -hipError_t GetWorkspaceSize(Flag flag, size_t n, size_t* workspace_size) { - *workspace_size = 0; - return DispatchInputSorted(flag, n, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_size, 0); -} - -} // namespace unique - -} // namespace cuda - -} // namespace oneflow - -#endif // WITH_ROCM - -#endif // ONEFLOW_CORE_CUDA_UNIQUE_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_HIP_UNIQUE_H_ +#define ONEFLOW_CORE_HIP_UNIQUE_H_ + +#ifdef WITH_ROCM + +#include +#include "hip/hip_runtime.h" +// #include +#include "hip/hip_runtime.h" +#include "oneflow/core/common/permutation_iterator.h" +#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" + +namespace oneflow { + +namespace cuda { + +namespace unique { + +using Flag = uint32_t; +static constexpr Flag kDefault = 0x0; +static constexpr Flag kInputSorted = 0x1; +static constexpr Flag kOutputInverseIndices = 0x1 << 1; +static constexpr Flag kOutputCounts = 0x1 << 2; + +namespace { + +constexpr size_t kCudaAlignSize = 512; + +__device__ __host__ __forceinline__ size_t GetCudaAlignedSize(size_t size) { + return (size + kCudaAlignSize - 1) / kCudaAlignSize * kCudaAlignSize; +} + +template +__device__ __host__ __forceinline__ T* PtrOffset(void* ptr, size_t offset) { + return reinterpret_cast(reinterpret_cast(ptr) + offset); +} + +__device__ __host__ __forceinline__ size_t max(size_t a, size_t b) { return a > b ? 
a : b; } + +template +hipError_t DoUnique(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, + void* workspace, size_t* workspace_size, hipStream_t stream) { + size_t ws = *workspace_size; + hipError_t err = hipcub::DeviceSelect::Unique( + workspace, ws, sorted_in, unique, num_unique, n, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DoUniqueWithCounts(size_t n, const Key* sorted_in, Key* unique, Index* num_unique, + Index* counts, void* workspace, size_t* workspace_size, + hipStream_t stream) { + size_t ws = *workspace_size; + hipError_t err = hipcub::DeviceRunLengthEncode::Encode( + workspace, ws, sorted_in, unique, counts, num_unique, n, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DispatchOutputCounts(Flag flag, size_t n, const Key* sorted_in, Key* unique, + Index* num_unique, Index* counts, void* workspace, + size_t* workspace_size, hipStream_t stream) { + size_t ws = *workspace_size; + if ((flag & kOutputCounts) != 0) { + hipError_t err = DoUniqueWithCounts(n, sorted_in, unique, num_unique, counts, + workspace, &ws, stream); + if (err != hipSuccess) { return err; } + } else { + hipError_t err = + DoUnique(n, sorted_in, unique, num_unique, workspace, &ws, stream); + if (err != hipSuccess) { return err; } + } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DoGenInverseIndices(size_t n, const Key* sorted_in, + InverseIndicesIter inverse_indices_iter, void* workspace, + size_t* workspace_size, hipStream_t stream) { + size_t ws = *workspace_size; + NotEqualToPreviousAdjacentIterator unique_counting_iter(sorted_in, 0); + hipError_t err = + hipcub::DeviceScan::InclusiveSum( + workspace, ws, unique_counting_iter, inverse_indices_iter, n, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = ws; } + return hipSuccess; +} + +template +hipError_t DispatchOutputInverseIndices(Flag flag, size_t n, const Key* sorted_in, Key* unique, + Index* num_unique, InverseIndicesIter inverse_indices_iter, + Index* counts, void* workspace, size_t* workspace_size, + hipStream_t stream) { + size_t dispatch_with_counts_ws = *workspace_size; + size_t do_gen_inverse_indices_ws = *workspace_size; + { + hipError_t err = + DispatchOutputCounts(flag, n, sorted_in, unique, num_unique, counts, workspace, + &dispatch_with_counts_ws, stream); + if (err != hipSuccess) { return err; } + } + if ((flag & kOutputInverseIndices) != 0) { + hipError_t err = DoGenInverseIndices( + n, sorted_in, inverse_indices_iter, workspace, &do_gen_inverse_indices_ws, stream); + if (err != hipSuccess) { return err; } + } + if (*workspace_size == 0) { + *workspace_size = max(dispatch_with_counts_ws, do_gen_inverse_indices_ws); + } + return hipSuccess; +} + +template +__global__ void IotaKernel(size_t n, T* out) { + for (T i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < n; + i += step) { + out[i] = i; + } +} + +template +hipError_t DoSort(size_t n, const Key* in, Key* sorted, Index* sorted_indices, void* workspace, + size_t* workspace_size, hipStream_t stream) { + Index* indices; + const size_t indices_size = GetCudaAlignedSize(n * sizeof(Index)); + void* sort_workspace; + size_t sort_ws; + if (*workspace_size == 0) { + indices = nullptr; + sort_workspace = nullptr; + sort_ws = 0; + } else { + if 
(*workspace_size <= indices_size) { return hipErrorInvalidValue; } + indices = PtrOffset(workspace, 0); + sort_workspace = PtrOffset(workspace, indices_size); + sort_ws = *workspace_size - indices_size; + } + if (*workspace_size != 0) { + const int block_size = 1024; + const int num_blocks = static_cast((n + block_size - 1) / block_size); + IotaKernel<<>>(n, indices); + } + hipError_t err = hipcub::DeviceRadixSort::SortPairs( + sort_workspace, sort_ws, in, sorted, indices, sorted_indices, n, 0, sizeof(Key) * 8, stream); + if (err != hipSuccess) { return err; } + if (*workspace_size == 0) { *workspace_size = indices_size + sort_ws; } + return hipSuccess; +} + +template +hipError_t DispatchInputSorted(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, + Index* inverse_indices, Index* counts, void* workspace, + size_t* workspace_size, hipStream_t stream) { + if ((flag & kInputSorted) != 0) { + return DispatchOutputInverseIndices(flag, n, in, unique, num_unique, + inverse_indices, counts, workspace, + workspace_size, stream); + } else { + const size_t sorted_in_size = GetCudaAlignedSize(n * sizeof(Key)); + const size_t sorted_indices_size = GetCudaAlignedSize(n * sizeof(Index)); + const size_t sort_buffer_size = sorted_in_size + sorted_indices_size; + Key* sorted_in; + Index* sorted_indices; + size_t do_sort_ws; + void* do_sort_workspace; + size_t do_inverse_indices_ws; + void* do_inverse_indices_workspace; + if (*workspace_size == 0) { + sorted_in = nullptr; + sorted_indices = nullptr; + do_sort_ws = 0; + do_sort_workspace = nullptr; + do_inverse_indices_ws = 0; + do_inverse_indices_workspace = nullptr; + } else { + if (*workspace_size <= sort_buffer_size) { return hipErrorInvalidValue; } + sorted_in = PtrOffset(workspace, 0); + sorted_indices = PtrOffset(workspace, sorted_in_size); + do_sort_ws = *workspace_size - sort_buffer_size; + do_sort_workspace = PtrOffset(workspace, sort_buffer_size); + do_inverse_indices_ws = do_sort_ws; + do_inverse_indices_workspace = do_sort_workspace; + } + { + hipError_t err = DoSort(n, in, sorted_in, sorted_indices, do_sort_workspace, + &do_sort_ws, stream); + if (err != hipSuccess) { return err; } + } + PermutationIterator inverse_indices_iter(inverse_indices, + sorted_indices); + { + hipError_t err = DispatchOutputInverseIndices( + flag, n, sorted_in, unique, num_unique, inverse_indices_iter, counts, + do_inverse_indices_workspace, &do_inverse_indices_ws, stream); + if (err != hipSuccess) { return err; } + } + if (*workspace_size == 0) { + *workspace_size = sort_buffer_size + max(do_sort_ws, do_inverse_indices_ws); + } + return hipSuccess; + } +} + +} // namespace + +template +hipError_t Launch(Flag flag, size_t n, const Key* in, Key* unique, Index* num_unique, + Index* inverse_indices, Index* counts, void* workspace, size_t workspace_size, + hipStream_t stream) { + if (workspace_size == 0) { return hipErrorInvalidValue; } + return DispatchInputSorted(flag, n, in, unique, num_unique, inverse_indices, counts, + workspace, &workspace_size, stream); +} + +template +hipError_t GetWorkspaceSize(Flag flag, size_t n, size_t* workspace_size) { + *workspace_size = 0; + return DispatchInputSorted(flag, n, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, workspace_size, 0); +} + +} // namespace unique + +} // namespace cuda + +} // namespace oneflow + +#endif // WITH_ROCM + +#endif // ONEFLOW_CORE_CUDA_UNIQUE_H_ diff --git a/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp 
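Like the hipcub primitives it wraps, the unique.hip.h header above uses a two-pass calling convention: GetWorkspaceSize runs the whole dispatch chain with null buffers and *workspace_size == 0 so that only the required byte count is accumulated, and Launch then repeats the dispatch with a real workspace (it rejects workspace_size == 0). A hedged sketch of that calling pattern follows; the key/index types and buffer names are illustrative, not taken from this patch.

// Hedged sketch, not part of the patch: two-pass workspace query for oneflow::cuda::unique.
#include <hip/hip_runtime.h>
#include "oneflow/core/hip/unique.hip.h"

hipError_t UniqueExample(hipStream_t stream, const int64_t* in, size_t n, int64_t* unique_out,
                         int32_t* num_unique, int32_t* inverse_indices, int32_t* counts) {
  using namespace oneflow::cuda::unique;
  const Flag flag = kOutputInverseIndices | kOutputCounts;  // input is not pre-sorted
  size_t workspace_size = 0;
  // Pass 1: query the workspace size without touching any device buffers.
  hipError_t err = GetWorkspaceSize<int64_t, int32_t>(flag, n, &workspace_size);
  if (err != hipSuccess) { return err; }
  void* workspace = nullptr;
  err = hipMalloc(&workspace, workspace_size);
  if (err != hipSuccess) { return err; }
  // Pass 2: run the actual sort/unique/inverse-index pipeline on the stream.
  err = Launch<int64_t, int32_t>(flag, n, in, unique_out, num_unique, inverse_indices, counts,
                                 workspace, workspace_size, stream);
  const hipError_t free_err = hipFree(workspace);
  return err != hipSuccess ? err : free_err;
}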
b/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp index cab89de..3e2887c 100644 --- a/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp +++ b/oneflow/core/job/collective_boxing/nccl_executor_backend.hip.cpp @@ -1,665 +1,665 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/job/collective_boxing/nccl_executor_backend.h" -#include "oneflow/core/job/collective_boxing/request_store.h" -#include "oneflow/core/device/nccl_util.h" -#include "oneflow/core/graph/boxing/collective_boxing_util.h" -#include "oneflow/core/job/resource_desc.h" -#include "oneflow/core/control/ctrl_client.h" -#include "oneflow/core/control/global_process_ctx.h" -#include "oneflow/core/job/global_for.h" -#include "oneflow/core/thread/thread_pool.h" -#include "oneflow/core/device/cuda_util.h" - -#include - -#include -#include - -namespace oneflow { - -namespace boxing { - -namespace collective { - -namespace { - -ncclRedOp_t GetNcclReduceOp(ReduceMethod reduce_method) { - if (reduce_method == kReduceMethodSum) { - return ncclRedOp_t::ncclSum; - } else { - UNIMPLEMENTED(); - return ncclRedOp_t{}; - } -} - -std::string GetNcclUniqueIdRpcKey(const std::string& name, int64_t stream_id) { - return "CollectiveBoxingExecutorNcclUniqueIdRpcKey-" + name + "-" + std::to_string(stream_id); -} - -struct CopyParams { - void* dst; - const void* src; - int64_t count; -}; - -constexpr int64_t kMultiCopyParamsMaxSize = 128; -constexpr int64_t kMultiCopyAlignSize = 32; - -int64_t GetMultiCopyAlignedSize(int64_t size) { - return ((size + kMultiCopyAlignSize - 1) / kMultiCopyAlignSize) * kMultiCopyAlignSize; -} - -struct MultiCopyParams { - CopyParams params[kMultiCopyParamsMaxSize]; - int64_t count; - - MultiCopyParams() : count(0), params{} {} - - void Add(void* dst, const void* src, int64_t count) { - CHECK_LT(this->count, kMultiCopyParamsMaxSize); - params[this->count].dst = dst; - params[this->count].src = src; - params[this->count].count = count; - this->count += 1; - } -}; - -using BulkType = ulonglong2; - -__global__ void MultiCopyGpu(MultiCopyParams multi_params) { - for (int64_t p = 0; p < multi_params.count; ++p) { - const CopyParams params = multi_params.params[p]; - auto* bulk_dst = reinterpret_cast(params.dst); - const auto* bulk_src = reinterpret_cast(params.src); - const int64_t bulk_count = params.count / sizeof(BulkType); - CUDA_1D_KERNEL_LOOP_T(int64_t, i, bulk_count) { bulk_dst[i] = bulk_src[i]; } - const int64_t tail_offset = bulk_count * sizeof(BulkType); - auto* tail_dst = reinterpret_cast(params.dst) + tail_offset; - const auto* tail_src = reinterpret_cast(params.src) + tail_offset; - const int64_t tail_count = params.count - tail_offset; - CUDA_1D_KERNEL_LOOP_T(int64_t, i, tail_count) { tail_dst[i] = tail_src[i]; } - } -} - -void MultiCopy(hipStream_t stream, const MultiCopyParams& multi_params) { - if (multi_params.count <= 0) { return; } - CHECK_LE(multi_params.count, 
kMultiCopyParamsMaxSize); - int64_t max_count = multi_params.params[0].count; - for (int64_t i = 0; i < multi_params.count; ++i) { - max_count = std::max(max_count, multi_params.params[i].count); - } - hipLaunchKernelGGL(MultiCopyGpu, BlocksNum4ThreadsNum(max_count), kCudaThreadsNumPerBlock, 0, stream, - multi_params); -} - -class CommRank final { - public: - OF_DISALLOW_COPY(CommRank); - CommRank(int32_t device_id, int32_t global_rank, int32_t global_rank_count, int32_t local_rank, - int32_t local_rank_count) - : device_id_(device_id), - global_rank_(global_rank), - local_rank_(local_rank), - nccl_comm_(nullptr) {} - - CommRank(CommRank&& rhs) noexcept { - this->device_id_ = rhs.device_id_; - this->global_rank_ = rhs.global_rank_; - this->local_rank_ = rhs.local_rank_; - this->nccl_comm_ = rhs.nccl_comm_; - rhs.nccl_comm_ = nullptr; - } - - ~CommRank() { - if (nccl_comm_ != nullptr) { - CudaCurrentDeviceGuard guard(device_id_); - OF_NCCL_CHECK(ncclCommDestroy(nccl_comm_)); - } - } - - int32_t device_id() const { return device_id_; } - - ncclComm_t nccl_comm() const { return nccl_comm_; } - - void InitRank(ncclUniqueId unique_id, int32_t global_rank_count) { - CudaCurrentDeviceGuard guard(device_id_); - OF_NCCL_CHECK(ncclCommInitRank(&nccl_comm_, global_rank_count, unique_id, global_rank_)); - } - - private: - int32_t device_id_; - int32_t global_rank_; - int32_t local_rank_; - ncclComm_t nccl_comm_; -}; - -class CommGroup final { - public: - OF_DISALLOW_COPY(CommGroup); - CommGroup() = default; - ~CommGroup() = default; - CommGroup(CommGroup&& rhs) noexcept { - rank_vec_.swap(rhs.rank_vec_); - global_rank_count_ = rhs.global_rank_count_; - } - - void InitGroup(const DeviceSet& device_set, const std::string& unique_name) { - CudaCurrentDeviceGuard guard; - const int64_t this_machine_id = GlobalProcessCtx::Rank(); - global_rank_count_ = device_set.device_size(); - std::vector local_ranks; - for (int32_t i = 0; i < global_rank_count_; ++i) { - if (device_set.device(i).machine_id() == this_machine_id) { local_ranks.emplace_back(i); } - } - const int32_t local_rank_count = local_ranks.size(); - CHECK_GT(local_rank_count, 0); - ncclUniqueId nccl_unique_id{}; - if (local_ranks.front() == 0) { - OF_NCCL_CHECK(ncclGetUniqueId(&nccl_unique_id)); - if (local_rank_count != global_rank_count_) { - Singleton::Get()->PushKV(unique_name, NcclUniqueIdToString(nccl_unique_id)); - } - } else { - Singleton::Get()->PullKV(unique_name, [&nccl_unique_id](const std::string& val) { - NcclUniqueIdFromString(val, &nccl_unique_id); - }); - } - rank_vec_.reserve(local_rank_count); - OF_NCCL_CHECK(ncclGroupStart()); - for (int32_t local_rank = 0; local_rank < local_ranks.size(); ++local_rank) { - const int32_t global_rank = local_ranks.at(local_rank); - const int32_t device_id = device_set.device(global_rank).device_id(); - OF_CUDA_CHECK(hipSetDevice(device_id)); - rank_vec_.emplace_back(device_id, global_rank, global_rank_count_, local_rank, - local_rank_count); - rank_vec_.at(local_rank).InitRank(nccl_unique_id, global_rank_count_); - } - OF_NCCL_CHECK(ncclGroupEnd()); - } - - int32_t global_rank_count() const { return global_rank_count_; } - - int32_t local_rank_count() const { return rank_vec_.size(); } - - const CommRank& GetCommRank(int32_t local_rank) const { return rank_vec_.at(local_rank); } - - private: - std::vector rank_vec_; - int32_t global_rank_count_ = 0; -}; - -class StreamCtx { - public: - OF_DISALLOW_COPY(StreamCtx); - StreamCtx(int32_t device_id, size_t fusion_buffer_size) - : 
device_id_(device_id), fusion_buffer_size_(fusion_buffer_size) { - CudaCurrentDeviceGuard guard(device_id_); - int priority; - OF_CUDA_CHECK(hipDeviceGetStreamPriorityRange(nullptr, &priority)); - OF_CUDA_CHECK(hipStreamCreateWithPriority(&stream_, hipStreamNonBlocking, priority)); - OF_CUDA_CHECK(hipMalloc(&fusion_buffer_, fusion_buffer_size_)); - cb_event_poller_ = std::thread(&StreamCtx::PollEvent, this); - } - ~StreamCtx() { - cb_event_chan_.Close(); - cb_event_poller_.join(); - CudaCurrentDeviceGuard guard(device_id_); - OF_CUDA_CHECK(hipStreamSynchronize(stream_)); - OF_CUDA_CHECK(hipStreamDestroy(stream_)); - OF_CUDA_CHECK(hipFree(fusion_buffer_)); - } - - void PollEvent() { - CudaCurrentDeviceGuard guard(device_id_); - while (true) { - std::pair> cb_event; - ChannelStatus status = cb_event_chan_.Receive(&cb_event); - if (status == kChannelStatusErrorClosed) { break; } - CHECK_EQ(status, kChannelStatusSuccess); - OF_CUDA_CHECK(hipEventSynchronize(cb_event.first)); - cb_event.second(); - OF_CUDA_CHECK(hipEventDestroy(cb_event.first)); - } - } - - void AddCallback(const std::function& callback) { - hipEvent_t event; - OF_CUDA_CHECK(hipEventCreateWithFlags(&event, hipEventDisableTiming)); - OF_CUDA_CHECK(hipEventRecord(event, stream_)); - CHECK_EQ(cb_event_chan_.Send(std::make_pair(event, callback)), kChannelStatusSuccess); - } - - int32_t device_id() const { return device_id_; } - - hipStream_t stream() const { return stream_; } - - size_t fusion_buffer_size() const { return fusion_buffer_size_; } - - char* fusion_buffer() const { return fusion_buffer_; } - - private: - int32_t device_id_; - hipStream_t stream_ = nullptr; - size_t fusion_buffer_size_; - char* fusion_buffer_ = nullptr; - Channel>> cb_event_chan_; - std::thread cb_event_poller_; -}; - -void LaunchFusedAllReduce(const CommGroup& comm_group, - const std::vector>& device_id2stream_ctx, - const std::shared_ptr& request_store, - const std::vector& request_ids) { - CHECK_LE(request_ids.size(), kMultiCopyParamsMaxSize); - RequestEntry* first_request_entry = request_store->MutRequestEntry(request_ids.front()); - const ncclDataType_t nccl_data_type = - GetNcclDataType(first_request_entry->desc().op_desc().data_type()); - const ncclRedOp_t nccl_reduce_op = - GetNcclReduceOp(first_request_entry->desc().op_desc().reduce_method()); - const int64_t size_of_data_type = - GetSizeOfDataType(first_request_entry->desc().op_desc().data_type()); - std::vector offset_vec; - offset_vec.reserve(request_ids.size()); - int64_t offset = 0; - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - offset_vec.emplace_back(offset); - offset += GetMultiCopyAlignedSize(request_entry->size_in_bytes()); - }); - const int64_t elem_cnt = offset / size_of_data_type; - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - MultiCopyParams copy_in_params; - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - CHECK_LE(offset, stream_ctx->fusion_buffer_size()); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - copy_in_params.Add(stream_ctx->fusion_buffer() + offset_vec.at(i), - request_entry->GetRuntimeRequest(local_rank)->send_buff, - request_entry->size_in_bytes()); - }); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - 
MultiCopy(stream_ctx->stream(), copy_in_params); - } - - OF_NCCL_CHECK(ncclGroupStart()); - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - OF_NCCL_CHECK(ncclAllReduce(stream_ctx->fusion_buffer(), stream_ctx->fusion_buffer(), elem_cnt, - nccl_data_type, nccl_reduce_op, comm_rank.nccl_comm(), - stream_ctx->stream())); - } - OF_NCCL_CHECK(ncclGroupEnd()); - - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - MultiCopyParams copy_out_params; - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - copy_out_params.Add(request_entry->GetRuntimeRequest(local_rank)->recv_buff, - stream_ctx->fusion_buffer() + offset_vec.at(i), - request_entry->size_in_bytes()); - }); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - MultiCopy(stream_ctx->stream(), copy_out_params); - } -} - -void LaunchAggregatedOps(const CommGroup& comm_group, - const std::vector>& device_id2stream_ctx, - const std::shared_ptr& request_store, - const std::vector& request_ids) { - OF_NCCL_CHECK(ncclGroupStart()); - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - const auto comm = comm_rank.nccl_comm(); - const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const auto& op_desc = request_entry->desc().op_desc(); - const std::shared_ptr& runtime_request_info = - request_entry->GetRuntimeRequest(local_rank); - const OpType op_type = op_desc.op_type(); - const void* send_buff = runtime_request_info->send_buff; - void* recv_buff = runtime_request_info->recv_buff; - const int64_t elem_cnt = request_entry->elem_cnt(); - const ncclDataType_t nccl_data_type = GetNcclDataType(op_desc.data_type()); - const int32_t num_ranks = comm_group.global_rank_count(); - if (op_type == OpType::kOpTypeAllReduce) { - OF_NCCL_CHECK(ncclAllReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, - GetNcclReduceOp(op_desc.reduce_method()), comm, - stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeAllGather) { - CHECK_EQ(elem_cnt % num_ranks, 0); - OF_NCCL_CHECK(ncclAllGather(send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, - comm, stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeReduceScatter) { - CHECK_EQ(elem_cnt % num_ranks, 0); - OF_NCCL_CHECK(ncclReduceScatter( - send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, - GetNcclReduceOp(op_desc.reduce_method()), comm, stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeReduce) { - OF_NCCL_CHECK(ncclReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, - GetNcclReduceOp(op_desc.reduce_method()), op_desc.root(), comm, - stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeBroadcast) { - OF_NCCL_CHECK(ncclBroadcast(send_buff, recv_buff, elem_cnt, nccl_data_type, - op_desc.root(), comm, 
stream_ctx->stream())); - } else if (op_type == OpType::kOpTypeAll2All) { -#if NCCL_VERSION_CODE > 2700 - const int64_t elem_per_rank = elem_cnt / num_ranks; - const int64_t elem_per_chunk = elem_per_rank / num_ranks; - const int64_t dtype_size = GetSizeOfDataType(op_desc.data_type()); - const int64_t chunk_size = elem_per_chunk * dtype_size; - for (int64_t j = 0; j < num_ranks; ++j) { - OF_NCCL_CHECK(ncclSend(reinterpret_cast( - reinterpret_cast(send_buff) + j * chunk_size), - elem_per_chunk, nccl_data_type, j, comm, - stream_ctx->stream())); - OF_NCCL_CHECK(ncclRecv( - reinterpret_cast(reinterpret_cast(recv_buff) + j * chunk_size), - elem_per_chunk, nccl_data_type, j, comm, stream_ctx->stream())); - } -#else - UNIMPLEMENTED(); -#endif - } else { - UNIMPLEMENTED(); - } - }); - } - OF_NCCL_CHECK(ncclGroupEnd()); -} - -void AddCallbackAndResetRuntimeRequest( - const CommGroup& comm_group, - const std::vector>& device_id2stream_ctx, - const std::shared_ptr& request_store, const std::vector& request_ids) { - std::vector>> saved_runtime_request_info( - request_ids.size()); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - saved_runtime_request_info.at(i) = std::move(request_entry->ResetRuntimeRequest()); - }); - for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { - const CommRank& comm_rank = comm_group.GetCommRank(local_rank); - StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); - auto runtime_request_info_vec = - std::make_shared>>(); - runtime_request_info_vec->reserve(request_ids.size()); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - runtime_request_info_vec->emplace_back( - std::move(saved_runtime_request_info.at(i).at(local_rank))); - }); - OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); - stream_ctx->AddCallback([runtime_request_info_vec]() { - for (auto& runtime_request_info : *runtime_request_info_vec) { - runtime_request_info->callback(Maybe::Ok()); - } - }); - } -} - -} // namespace - -struct NcclExecutorBackend::Impl { - Impl(const CollectiveBoxingConf& conf, std::shared_ptr request_store) - : conf(conf), request_store(std::move(request_store)) { - CHECK_GT(conf.nccl_num_streams(), 0); - CHECK_GE(conf.nccl_fusion_threshold_mb(), 0); - fusion_threshold = conf.nccl_fusion_threshold_mb() * 1024 * 1024; - num_streams = conf.nccl_num_streams(); - current_stream_id = 0; - enable_mixed_fusion = - (!conf.nccl_fusion_all_reduce_use_buffer()) && conf.nccl_enable_mixed_fusion(); - int nccl_version; - OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); - if (nccl_version == 21003) { - LOG(WARNING) - << "Current nccl version is 2.10.3, in this version, ncclGroup() with mixed " - "datatype/element/collective could induce crash or corruption, so we will not " - "fuse any request."; - } - InitStreamCtx(); - InitIsOpTypeFusionEnabled(); - } - ~Impl() { - stream_id2device_id2stream_ctx.clear(); - device_set2stream_id2comm_group.clear(); - } - - void InitCommGroup(int64_t job_id) { - std::set local_device_ids; - request_store->ForEachMutRequestEntryInJob( - job_id, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const auto& request = request_entry->desc(); - if (request.op_desc().backend() != Backend::kBackendNCCL) { return; } - if (!request_entry->HasRankOnThisNode()) { return; } - const DeviceSet& device_set = 
request.device_set(); - if (device_set2stream_id2comm_group.count(device_set) > 0) { return; } - auto& stream_id2comm_group = device_set2stream_id2comm_group[device_set]; - stream_id2comm_group.resize(num_streams); - for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { - stream_id2comm_group.at(stream_id).InitGroup( - device_set, GetNcclUniqueIdRpcKey(request.op_desc().name(), stream_id)); - } - for (int32_t j = 0; j < stream_id2comm_group.at(0).local_rank_count(); ++j) { - local_device_ids.emplace(stream_id2comm_group.at(0).GetCommRank(j).device_id()); - } - }); - for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { - for (const int64_t device_id : local_device_ids) { - if (stream_id2device_id2stream_ctx.at(stream_id).at(device_id) == nullptr) { - stream_id2device_id2stream_ctx.at(stream_id).at(device_id) = - std::make_unique(device_id, fusion_threshold); - } - } - } - } - - void InitStreamCtx() { - int32_t num_devices; - OF_CUDA_CHECK(hipGetDeviceCount(&num_devices)); - stream_id2device_id2stream_ctx.resize(num_streams); - for (int64_t stream_id = 0; stream_id < num_streams; ++stream_id) { - stream_id2device_id2stream_ctx.at(stream_id).resize(num_devices); - } - } - - void InitIsOpTypeFusionEnabled() { - op_type2fusion_enabled.resize(OpType_ARRAYSIZE, false); - op_type2fusion_enabled.at(OpType::kOpTypeAllReduce) = conf.nccl_fusion_all_reduce(); - op_type2fusion_enabled.at(OpType::kOpTypeAllGather) = conf.nccl_fusion_all_gather(); - op_type2fusion_enabled.at(OpType::kOpTypeReduceScatter) = conf.nccl_fusion_reduce_scatter(); - op_type2fusion_enabled.at(OpType::kOpTypeReduce) = conf.nccl_fusion_reduce(); - op_type2fusion_enabled.at(OpType::kOpTypeBroadcast) = conf.nccl_fusion_broadcast(); - op_type2fusion_enabled.at(OpType::kOpTypeAll2All) = false; - } - - int32_t NextStreamId() { - const int32_t stream_id = current_stream_id; - current_stream_id = (current_stream_id + 1) % num_streams; - return stream_id; - } - - bool IsOpTypeFusionEnabled(OpType op_type) const { return op_type2fusion_enabled.at(op_type); } - - bool IsRequestEntryFusionEnabled(const RequestEntry* entry) const { - return IsOpTypeFusionEnabled(entry->desc().op_desc().op_type()); - } - - bool CanRequestEntryFuse(const RequestEntry* lhs, const RequestEntry* rhs) const { - { - int nccl_version; - OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); - // Workaround for https://github.com/NVIDIA/nccl/issues/560 - if (nccl_version == 21003) { return false; } - } - if (lhs->device_set_symbol() != rhs->device_set_symbol()) { return false; } - if ((!IsRequestEntryFusionEnabled(lhs)) || (!IsRequestEntryFusionEnabled(rhs))) { - return false; - } - if ((!enable_mixed_fusion) - && lhs->desc().op_desc().op_type() != rhs->desc().op_desc().op_type()) { - return false; - } - if (conf.nccl_fusion_all_reduce_use_buffer()) { - if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce - && rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { - CHECK(lhs->desc().op_desc().has_reduce_method()); - CHECK(rhs->desc().op_desc().has_reduce_method()); - return lhs->desc().op_desc().reduce_method() == rhs->desc().op_desc().reduce_method() - && lhs->desc().op_desc().data_type() == rhs->desc().op_desc().data_type(); - } else if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce - || rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { - return false; - } else { - return true; - } - } else { - return true; - } - } - - void GroupRequests(const std::vector& request_ids, - const std::function&&, 
void*)>& Handler) { - std::vector group; - int64_t group_size = 0; - const int64_t fusion_max_ops = std::min(conf.nccl_fusion_max_ops(), kMultiCopyParamsMaxSize); - request_store->ForEachMutRequestEntryForIdsInJob( - request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const auto& request = request_entry->desc(); - const int64_t size = GetMultiCopyAlignedSize(request_entry->size_in_bytes()); - if (group.empty() - || !CanRequestEntryFuse(request_store->MutRequestEntry(group.back()), request_entry) - || group_size + size > fusion_threshold || group.size() >= fusion_max_ops) { - if (!group.empty()) { - void* token = CreateGroupToken(group); - Handler(std::move(group), token); - group.clear(); - group_size = 0; - } - } - group.emplace_back(request_id); - group_size += size; - }); - if (!group.empty()) { - void* token = CreateGroupToken(group); - Handler(std::move(group), token); - } - } - - struct GroupToken { - GroupToken(const std::vector& group, std::vector* stream_id2comm_group) - : request_ids(group), stream_id2comm_group(stream_id2comm_group) {} - std::vector request_ids; - std::vector* stream_id2comm_group; - }; - - void* CreateGroupToken(const std::vector& group) { - CHECK_GT(group.size(), 0); - void* group_token; - const DeviceSet& first_device_set = - request_store->MutRequestEntry(group.front())->desc().device_set(); - auto it = device_set2stream_id2comm_group.find(first_device_set); - CHECK(it != device_set2stream_id2comm_group.end()); - group_token = new GroupToken(group, &it->second); - request_store->ForEachMutRequestEntryForIdsInJob( - group, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { - const DeviceSet& device_set = request_entry->desc().device_set(); - CHECK(first_device_set == device_set); - }); - return group_token; - } - - void DestroyGroupToken(void* group_token) { - GroupToken* token = static_cast(group_token); - delete token; - } - - void ExecuteGroup(void* group_token) { - GroupToken* token = static_cast(group_token); - const std::vector& request_ids = token->request_ids; - if (request_ids.empty()) { return; } - const int32_t stream_id = NextStreamId(); - CudaCurrentDeviceGuard device_guard; - const auto& comm_group = token->stream_id2comm_group->at(stream_id); - auto& device_id2stream_ctx = stream_id2device_id2stream_ctx.at(stream_id); - if (request_store->MutRequestEntry(request_ids.front())->desc().op_desc().op_type() - == OpType::kOpTypeAllReduce - && conf.nccl_fusion_all_reduce_use_buffer() && request_ids.size() > 1) { - LaunchFusedAllReduce(comm_group, device_id2stream_ctx, request_store, request_ids); - } else { - LaunchAggregatedOps(comm_group, device_id2stream_ctx, request_store, request_ids); - } - AddCallbackAndResetRuntimeRequest(comm_group, device_id2stream_ctx, request_store, request_ids); - } - - CollectiveBoxingConf conf; - int64_t fusion_threshold; - int32_t num_streams; - int32_t current_stream_id; - bool enable_mixed_fusion; - std::vector op_type2fusion_enabled; - std::shared_ptr request_store; - HashMap> device_set2stream_id2comm_group; - std::vector>> stream_id2device_id2stream_ctx; -}; - -NcclExecutorBackend::NcclExecutorBackend() = default; - -NcclExecutorBackend::~NcclExecutorBackend() = default; - -void NcclExecutorBackend::Init(std::shared_ptr request_store) { - impl_ = std::make_unique( - Singleton::Get()->collective_boxing_conf(), request_store); -} - -void NcclExecutorBackend::InitJob(int64_t job_id) { - CudaCurrentDeviceGuard guard; - impl_->InitCommGroup(job_id); -} - 
-void NcclExecutorBackend::DeinitJob(int64_t job_id) {} - -void NcclExecutorBackend::GroupRequests( - const std::vector& request_ids, - const std::function&&, void*)>& Handler) { - impl_->GroupRequests(request_ids, Handler); -} - -void* NcclExecutorBackend::CreateGroupToken(const std::vector& group) { - return impl_->CreateGroupToken(group); -} - -void NcclExecutorBackend::DestroyGroupToken(void* group_token) { - return impl_->DestroyGroupToken(group_token); -} - -void NcclExecutorBackend::ExecuteGroup(void* group_token) { impl_->ExecuteGroup(group_token); } - -} // namespace collective - -} // namespace boxing - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/job/collective_boxing/nccl_executor_backend.h" +#include "oneflow/core/job/collective_boxing/request_store.h" +#include "oneflow/core/device/nccl_util.h" +#include "oneflow/core/graph/boxing/collective_boxing_util.h" +#include "oneflow/core/job/resource_desc.h" +#include "oneflow/core/control/ctrl_client.h" +#include "oneflow/core/control/global_process_ctx.h" +#include "oneflow/core/job/global_for.h" +#include "oneflow/core/thread/thread_pool.h" +#include "oneflow/core/device/cuda_util.h" + +#include + +#include +#include + +namespace oneflow { + +namespace boxing { + +namespace collective { + +namespace { + +ncclRedOp_t GetNcclReduceOp(ReduceMethod reduce_method) { + if (reduce_method == kReduceMethodSum) { + return ncclRedOp_t::ncclSum; + } else { + UNIMPLEMENTED(); + return ncclRedOp_t{}; + } +} + +std::string GetNcclUniqueIdRpcKey(const std::string& name, int64_t stream_id) { + return "CollectiveBoxingExecutorNcclUniqueIdRpcKey-" + name + "-" + std::to_string(stream_id); +} + +struct CopyParams { + void* dst; + const void* src; + int64_t count; +}; + +constexpr int64_t kMultiCopyParamsMaxSize = 128; +constexpr int64_t kMultiCopyAlignSize = 32; + +int64_t GetMultiCopyAlignedSize(int64_t size) { + return ((size + kMultiCopyAlignSize - 1) / kMultiCopyAlignSize) * kMultiCopyAlignSize; +} + +struct MultiCopyParams { + CopyParams params[kMultiCopyParamsMaxSize]; + int64_t count; + + MultiCopyParams() : count(0), params{} {} + + void Add(void* dst, const void* src, int64_t count) { + CHECK_LT(this->count, kMultiCopyParamsMaxSize); + params[this->count].dst = dst; + params[this->count].src = src; + params[this->count].count = count; + this->count += 1; + } +}; + +using BulkType = ulonglong2; + +__global__ void MultiCopyGpu(MultiCopyParams multi_params) { + for (int64_t p = 0; p < multi_params.count; ++p) { + const CopyParams params = multi_params.params[p]; + auto* bulk_dst = reinterpret_cast(params.dst); + const auto* bulk_src = reinterpret_cast(params.src); + const int64_t bulk_count = params.count / sizeof(BulkType); + CUDA_1D_KERNEL_LOOP_T(int64_t, i, bulk_count) { bulk_dst[i] = bulk_src[i]; } + const int64_t tail_offset = bulk_count * sizeof(BulkType); + auto* tail_dst = reinterpret_cast(params.dst) + tail_offset; + const auto* tail_src 
= reinterpret_cast(params.src) + tail_offset; + const int64_t tail_count = params.count - tail_offset; + CUDA_1D_KERNEL_LOOP_T(int64_t, i, tail_count) { tail_dst[i] = tail_src[i]; } + } +} + +void MultiCopy(hipStream_t stream, const MultiCopyParams& multi_params) { + if (multi_params.count <= 0) { return; } + CHECK_LE(multi_params.count, kMultiCopyParamsMaxSize); + int64_t max_count = multi_params.params[0].count; + for (int64_t i = 0; i < multi_params.count; ++i) { + max_count = std::max(max_count, multi_params.params[i].count); + } + hipLaunchKernelGGL(MultiCopyGpu, BlocksNum4ThreadsNum(max_count), kCudaThreadsNumPerBlock, 0, stream, + multi_params); +} + +class CommRank final { + public: + OF_DISALLOW_COPY(CommRank); + CommRank(int32_t device_id, int32_t global_rank, int32_t global_rank_count, int32_t local_rank, + int32_t local_rank_count) + : device_id_(device_id), + global_rank_(global_rank), + local_rank_(local_rank), + nccl_comm_(nullptr) {} + + CommRank(CommRank&& rhs) noexcept { + this->device_id_ = rhs.device_id_; + this->global_rank_ = rhs.global_rank_; + this->local_rank_ = rhs.local_rank_; + this->nccl_comm_ = rhs.nccl_comm_; + rhs.nccl_comm_ = nullptr; + } + + ~CommRank() { + if (nccl_comm_ != nullptr) { + CudaCurrentDeviceGuard guard(device_id_); + OF_NCCL_CHECK(ncclCommDestroy(nccl_comm_)); + } + } + + int32_t device_id() const { return device_id_; } + + ncclComm_t nccl_comm() const { return nccl_comm_; } + + void InitRank(ncclUniqueId unique_id, int32_t global_rank_count) { + CudaCurrentDeviceGuard guard(device_id_); + OF_NCCL_CHECK(ncclCommInitRank(&nccl_comm_, global_rank_count, unique_id, global_rank_)); + } + + private: + int32_t device_id_; + int32_t global_rank_; + int32_t local_rank_; + ncclComm_t nccl_comm_; +}; + +class CommGroup final { + public: + OF_DISALLOW_COPY(CommGroup); + CommGroup() = default; + ~CommGroup() = default; + CommGroup(CommGroup&& rhs) noexcept { + rank_vec_.swap(rhs.rank_vec_); + global_rank_count_ = rhs.global_rank_count_; + } + + void InitGroup(const DeviceSet& device_set, const std::string& unique_name) { + CudaCurrentDeviceGuard guard; + const int64_t this_machine_id = GlobalProcessCtx::Rank(); + global_rank_count_ = device_set.device_size(); + std::vector local_ranks; + for (int32_t i = 0; i < global_rank_count_; ++i) { + if (device_set.device(i).machine_id() == this_machine_id) { local_ranks.emplace_back(i); } + } + const int32_t local_rank_count = local_ranks.size(); + CHECK_GT(local_rank_count, 0); + ncclUniqueId nccl_unique_id{}; + if (local_ranks.front() == 0) { + OF_NCCL_CHECK(ncclGetUniqueId(&nccl_unique_id)); + if (local_rank_count != global_rank_count_) { + Singleton::Get()->PushKV(unique_name, NcclUniqueIdToString(nccl_unique_id)); + } + } else { + Singleton::Get()->PullKV(unique_name, [&nccl_unique_id](const std::string& val) { + NcclUniqueIdFromString(val, &nccl_unique_id); + }); + } + rank_vec_.reserve(local_rank_count); + OF_NCCL_CHECK(ncclGroupStart()); + for (int32_t local_rank = 0; local_rank < local_ranks.size(); ++local_rank) { + const int32_t global_rank = local_ranks.at(local_rank); + const int32_t device_id = device_set.device(global_rank).device_id(); + OF_CUDA_CHECK(hipSetDevice(device_id)); + rank_vec_.emplace_back(device_id, global_rank, global_rank_count_, local_rank, + local_rank_count); + rank_vec_.at(local_rank).InitRank(nccl_unique_id, global_rank_count_); + } + OF_NCCL_CHECK(ncclGroupEnd()); + } + + int32_t global_rank_count() const { return global_rank_count_; } + + int32_t local_rank_count() const { 
return rank_vec_.size(); } + + const CommRank& GetCommRank(int32_t local_rank) const { return rank_vec_.at(local_rank); } + + private: + std::vector rank_vec_; + int32_t global_rank_count_ = 0; +}; + +class StreamCtx { + public: + OF_DISALLOW_COPY(StreamCtx); + StreamCtx(int32_t device_id, size_t fusion_buffer_size) + : device_id_(device_id), fusion_buffer_size_(fusion_buffer_size) { + CudaCurrentDeviceGuard guard(device_id_); + int priority; + OF_CUDA_CHECK(hipDeviceGetStreamPriorityRange(nullptr, &priority)); + OF_CUDA_CHECK(hipStreamCreateWithPriority(&stream_, hipStreamNonBlocking, priority)); + OF_CUDA_CHECK(hipMalloc(&fusion_buffer_, fusion_buffer_size_)); + cb_event_poller_ = std::thread(&StreamCtx::PollEvent, this); + } + ~StreamCtx() { + cb_event_chan_.Close(); + cb_event_poller_.join(); + CudaCurrentDeviceGuard guard(device_id_); + OF_CUDA_CHECK(hipStreamSynchronize(stream_)); + OF_CUDA_CHECK(hipStreamDestroy(stream_)); + OF_CUDA_CHECK(hipFree(fusion_buffer_)); + } + + void PollEvent() { + CudaCurrentDeviceGuard guard(device_id_); + while (true) { + std::pair> cb_event; + ChannelStatus status = cb_event_chan_.Receive(&cb_event); + if (status == kChannelStatusErrorClosed) { break; } + CHECK_EQ(status, kChannelStatusSuccess); + OF_CUDA_CHECK(hipEventSynchronize(cb_event.first)); + cb_event.second(); + OF_CUDA_CHECK(hipEventDestroy(cb_event.first)); + } + } + + void AddCallback(const std::function& callback) { + hipEvent_t event; + OF_CUDA_CHECK(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + OF_CUDA_CHECK(hipEventRecord(event, stream_)); + CHECK_EQ(cb_event_chan_.Send(std::make_pair(event, callback)), kChannelStatusSuccess); + } + + int32_t device_id() const { return device_id_; } + + hipStream_t stream() const { return stream_; } + + size_t fusion_buffer_size() const { return fusion_buffer_size_; } + + char* fusion_buffer() const { return fusion_buffer_; } + + private: + int32_t device_id_; + hipStream_t stream_ = nullptr; + size_t fusion_buffer_size_; + char* fusion_buffer_ = nullptr; + Channel>> cb_event_chan_; + std::thread cb_event_poller_; +}; + +void LaunchFusedAllReduce(const CommGroup& comm_group, + const std::vector>& device_id2stream_ctx, + const std::shared_ptr& request_store, + const std::vector& request_ids) { + CHECK_LE(request_ids.size(), kMultiCopyParamsMaxSize); + RequestEntry* first_request_entry = request_store->MutRequestEntry(request_ids.front()); + const ncclDataType_t nccl_data_type = + GetNcclDataType(first_request_entry->desc().op_desc().data_type()); + const ncclRedOp_t nccl_reduce_op = + GetNcclReduceOp(first_request_entry->desc().op_desc().reduce_method()); + const int64_t size_of_data_type = + GetSizeOfDataType(first_request_entry->desc().op_desc().data_type()); + std::vector offset_vec; + offset_vec.reserve(request_ids.size()); + int64_t offset = 0; + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + offset_vec.emplace_back(offset); + offset += GetMultiCopyAlignedSize(request_entry->size_in_bytes()); + }); + const int64_t elem_cnt = offset / size_of_data_type; + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + MultiCopyParams copy_in_params; + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + CHECK_LE(offset, stream_ctx->fusion_buffer_size()); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, 
[&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + copy_in_params.Add(stream_ctx->fusion_buffer() + offset_vec.at(i), + request_entry->GetRuntimeRequest(local_rank)->send_buff, + request_entry->size_in_bytes()); + }); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + MultiCopy(stream_ctx->stream(), copy_in_params); + } + + OF_NCCL_CHECK(ncclGroupStart()); + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + OF_NCCL_CHECK(ncclAllReduce(stream_ctx->fusion_buffer(), stream_ctx->fusion_buffer(), elem_cnt, + nccl_data_type, nccl_reduce_op, comm_rank.nccl_comm(), + stream_ctx->stream())); + } + OF_NCCL_CHECK(ncclGroupEnd()); + + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + MultiCopyParams copy_out_params; + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + copy_out_params.Add(request_entry->GetRuntimeRequest(local_rank)->recv_buff, + stream_ctx->fusion_buffer() + offset_vec.at(i), + request_entry->size_in_bytes()); + }); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + MultiCopy(stream_ctx->stream(), copy_out_params); + } +} + +void LaunchAggregatedOps(const CommGroup& comm_group, + const std::vector>& device_id2stream_ctx, + const std::shared_ptr& request_store, + const std::vector& request_ids) { + OF_NCCL_CHECK(ncclGroupStart()); + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + const auto comm = comm_rank.nccl_comm(); + const StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const auto& op_desc = request_entry->desc().op_desc(); + const std::shared_ptr& runtime_request_info = + request_entry->GetRuntimeRequest(local_rank); + const OpType op_type = op_desc.op_type(); + const void* send_buff = runtime_request_info->send_buff; + void* recv_buff = runtime_request_info->recv_buff; + const int64_t elem_cnt = request_entry->elem_cnt(); + const ncclDataType_t nccl_data_type = GetNcclDataType(op_desc.data_type()); + const int32_t num_ranks = comm_group.global_rank_count(); + if (op_type == OpType::kOpTypeAllReduce) { + OF_NCCL_CHECK(ncclAllReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, + GetNcclReduceOp(op_desc.reduce_method()), comm, + stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeAllGather) { + CHECK_EQ(elem_cnt % num_ranks, 0); + OF_NCCL_CHECK(ncclAllGather(send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, + comm, stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeReduceScatter) { + CHECK_EQ(elem_cnt % num_ranks, 0); + OF_NCCL_CHECK(ncclReduceScatter( + send_buff, recv_buff, elem_cnt / num_ranks, nccl_data_type, + GetNcclReduceOp(op_desc.reduce_method()), comm, stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeReduce) { + 
OF_NCCL_CHECK(ncclReduce(send_buff, recv_buff, elem_cnt, nccl_data_type, + GetNcclReduceOp(op_desc.reduce_method()), op_desc.root(), comm, + stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeBroadcast) { + OF_NCCL_CHECK(ncclBroadcast(send_buff, recv_buff, elem_cnt, nccl_data_type, + op_desc.root(), comm, stream_ctx->stream())); + } else if (op_type == OpType::kOpTypeAll2All) { +#if NCCL_VERSION_CODE > 2700 + const int64_t elem_per_rank = elem_cnt / num_ranks; + const int64_t elem_per_chunk = elem_per_rank / num_ranks; + const int64_t dtype_size = GetSizeOfDataType(op_desc.data_type()); + const int64_t chunk_size = elem_per_chunk * dtype_size; + for (int64_t j = 0; j < num_ranks; ++j) { + OF_NCCL_CHECK(ncclSend(reinterpret_cast( + reinterpret_cast(send_buff) + j * chunk_size), + elem_per_chunk, nccl_data_type, j, comm, + stream_ctx->stream())); + OF_NCCL_CHECK(ncclRecv( + reinterpret_cast(reinterpret_cast(recv_buff) + j * chunk_size), + elem_per_chunk, nccl_data_type, j, comm, stream_ctx->stream())); + } +#else + UNIMPLEMENTED(); +#endif + } else { + UNIMPLEMENTED(); + } + }); + } + OF_NCCL_CHECK(ncclGroupEnd()); +} + +void AddCallbackAndResetRuntimeRequest( + const CommGroup& comm_group, + const std::vector>& device_id2stream_ctx, + const std::shared_ptr& request_store, const std::vector& request_ids) { + std::vector>> saved_runtime_request_info( + request_ids.size()); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + saved_runtime_request_info.at(i) = std::move(request_entry->ResetRuntimeRequest()); + }); + for (int32_t local_rank = 0; local_rank < comm_group.local_rank_count(); ++local_rank) { + const CommRank& comm_rank = comm_group.GetCommRank(local_rank); + StreamCtx* stream_ctx = device_id2stream_ctx.at(comm_rank.device_id()).get(); + auto runtime_request_info_vec = + std::make_shared>>(); + runtime_request_info_vec->reserve(request_ids.size()); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + runtime_request_info_vec->emplace_back( + std::move(saved_runtime_request_info.at(i).at(local_rank))); + }); + OF_CUDA_CHECK(hipSetDevice(comm_rank.device_id())); + stream_ctx->AddCallback([runtime_request_info_vec]() { + for (auto& runtime_request_info : *runtime_request_info_vec) { + runtime_request_info->callback(Maybe::Ok()); + } + }); + } +} + +} // namespace + +struct NcclExecutorBackend::Impl { + Impl(const CollectiveBoxingConf& conf, std::shared_ptr request_store) + : conf(conf), request_store(std::move(request_store)) { + CHECK_GT(conf.nccl_num_streams(), 0); + CHECK_GE(conf.nccl_fusion_threshold_mb(), 0); + fusion_threshold = conf.nccl_fusion_threshold_mb() * 1024 * 1024; + num_streams = conf.nccl_num_streams(); + current_stream_id = 0; + enable_mixed_fusion = + (!conf.nccl_fusion_all_reduce_use_buffer()) && conf.nccl_enable_mixed_fusion(); + int nccl_version; + OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); + if (nccl_version == 21003) { + LOG(WARNING) + << "Current nccl version is 2.10.3, in this version, ncclGroup() with mixed " + "datatype/element/collective could induce crash or corruption, so we will not " + "fuse any request."; + } + InitStreamCtx(); + InitIsOpTypeFusionEnabled(); + } + ~Impl() { + stream_id2device_id2stream_ctx.clear(); + device_set2stream_id2comm_group.clear(); + } + + void InitCommGroup(int64_t job_id) { + std::set local_device_ids; + 
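+    // Editor's note (descriptive comment, not part of the original patch): the traversal below
+    // visits every NCCL-backed request of this job, builds one CommGroup per (DeviceSet,
+    // stream_id) the first time a DeviceSet is seen, and collects the local device ids that
+    // participate, so that a StreamCtx (HIP stream + fusion buffer + callback poller thread)
+    // can then be created lazily for each (stream_id, device_id) pair.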
request_store->ForEachMutRequestEntryInJob( + job_id, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const auto& request = request_entry->desc(); + if (request.op_desc().backend() != Backend::kBackendNCCL) { return; } + if (!request_entry->HasRankOnThisNode()) { return; } + const DeviceSet& device_set = request.device_set(); + if (device_set2stream_id2comm_group.count(device_set) > 0) { return; } + auto& stream_id2comm_group = device_set2stream_id2comm_group[device_set]; + stream_id2comm_group.resize(num_streams); + for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { + stream_id2comm_group.at(stream_id).InitGroup( + device_set, GetNcclUniqueIdRpcKey(request.op_desc().name(), stream_id)); + } + for (int32_t j = 0; j < stream_id2comm_group.at(0).local_rank_count(); ++j) { + local_device_ids.emplace(stream_id2comm_group.at(0).GetCommRank(j).device_id()); + } + }); + for (int32_t stream_id = 0; stream_id < num_streams; ++stream_id) { + for (const int64_t device_id : local_device_ids) { + if (stream_id2device_id2stream_ctx.at(stream_id).at(device_id) == nullptr) { + stream_id2device_id2stream_ctx.at(stream_id).at(device_id) = + std::make_unique(device_id, fusion_threshold); + } + } + } + } + + void InitStreamCtx() { + int32_t num_devices; + OF_CUDA_CHECK(hipGetDeviceCount(&num_devices)); + stream_id2device_id2stream_ctx.resize(num_streams); + for (int64_t stream_id = 0; stream_id < num_streams; ++stream_id) { + stream_id2device_id2stream_ctx.at(stream_id).resize(num_devices); + } + } + + void InitIsOpTypeFusionEnabled() { + op_type2fusion_enabled.resize(OpType_ARRAYSIZE, false); + op_type2fusion_enabled.at(OpType::kOpTypeAllReduce) = conf.nccl_fusion_all_reduce(); + op_type2fusion_enabled.at(OpType::kOpTypeAllGather) = conf.nccl_fusion_all_gather(); + op_type2fusion_enabled.at(OpType::kOpTypeReduceScatter) = conf.nccl_fusion_reduce_scatter(); + op_type2fusion_enabled.at(OpType::kOpTypeReduce) = conf.nccl_fusion_reduce(); + op_type2fusion_enabled.at(OpType::kOpTypeBroadcast) = conf.nccl_fusion_broadcast(); + op_type2fusion_enabled.at(OpType::kOpTypeAll2All) = false; + } + + int32_t NextStreamId() { + const int32_t stream_id = current_stream_id; + current_stream_id = (current_stream_id + 1) % num_streams; + return stream_id; + } + + bool IsOpTypeFusionEnabled(OpType op_type) const { return op_type2fusion_enabled.at(op_type); } + + bool IsRequestEntryFusionEnabled(const RequestEntry* entry) const { + return IsOpTypeFusionEnabled(entry->desc().op_desc().op_type()); + } + + bool CanRequestEntryFuse(const RequestEntry* lhs, const RequestEntry* rhs) const { + { + int nccl_version; + OF_NCCL_CHECK(ncclGetVersion(&nccl_version)); + // Workaround for https://github.com/NVIDIA/nccl/issues/560 + if (nccl_version == 21003) { return false; } + } + if (lhs->device_set_symbol() != rhs->device_set_symbol()) { return false; } + if ((!IsRequestEntryFusionEnabled(lhs)) || (!IsRequestEntryFusionEnabled(rhs))) { + return false; + } + if ((!enable_mixed_fusion) + && lhs->desc().op_desc().op_type() != rhs->desc().op_desc().op_type()) { + return false; + } + if (conf.nccl_fusion_all_reduce_use_buffer()) { + if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce + && rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { + CHECK(lhs->desc().op_desc().has_reduce_method()); + CHECK(rhs->desc().op_desc().has_reduce_method()); + return lhs->desc().op_desc().reduce_method() == rhs->desc().op_desc().reduce_method() + && lhs->desc().op_desc().data_type() == 
rhs->desc().op_desc().data_type(); + } else if (lhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce + || rhs->desc().op_desc().op_type() == OpType::kOpTypeAllReduce) { + return false; + } else { + return true; + } + } else { + return true; + } + } + + void GroupRequests(const std::vector& request_ids, + const std::function&&, void*)>& Handler) { + std::vector group; + int64_t group_size = 0; + const int64_t fusion_max_ops = std::min(conf.nccl_fusion_max_ops(), kMultiCopyParamsMaxSize); + request_store->ForEachMutRequestEntryForIdsInJob( + request_ids, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const auto& request = request_entry->desc(); + const int64_t size = GetMultiCopyAlignedSize(request_entry->size_in_bytes()); + if (group.empty() + || !CanRequestEntryFuse(request_store->MutRequestEntry(group.back()), request_entry) + || group_size + size > fusion_threshold || group.size() >= fusion_max_ops) { + if (!group.empty()) { + void* token = CreateGroupToken(group); + Handler(std::move(group), token); + group.clear(); + group_size = 0; + } + } + group.emplace_back(request_id); + group_size += size; + }); + if (!group.empty()) { + void* token = CreateGroupToken(group); + Handler(std::move(group), token); + } + } + + struct GroupToken { + GroupToken(const std::vector& group, std::vector* stream_id2comm_group) + : request_ids(group), stream_id2comm_group(stream_id2comm_group) {} + std::vector request_ids; + std::vector* stream_id2comm_group; + }; + + void* CreateGroupToken(const std::vector& group) { + CHECK_GT(group.size(), 0); + void* group_token; + const DeviceSet& first_device_set = + request_store->MutRequestEntry(group.front())->desc().device_set(); + auto it = device_set2stream_id2comm_group.find(first_device_set); + CHECK(it != device_set2stream_id2comm_group.end()); + group_token = new GroupToken(group, &it->second); + request_store->ForEachMutRequestEntryForIdsInJob( + group, [&](RequestEntry* request_entry, int32_t i, const RequestId& request_id) { + const DeviceSet& device_set = request_entry->desc().device_set(); + CHECK(first_device_set == device_set); + }); + return group_token; + } + + void DestroyGroupToken(void* group_token) { + GroupToken* token = static_cast(group_token); + delete token; + } + + void ExecuteGroup(void* group_token) { + GroupToken* token = static_cast(group_token); + const std::vector& request_ids = token->request_ids; + if (request_ids.empty()) { return; } + const int32_t stream_id = NextStreamId(); + CudaCurrentDeviceGuard device_guard; + const auto& comm_group = token->stream_id2comm_group->at(stream_id); + auto& device_id2stream_ctx = stream_id2device_id2stream_ctx.at(stream_id); + if (request_store->MutRequestEntry(request_ids.front())->desc().op_desc().op_type() + == OpType::kOpTypeAllReduce + && conf.nccl_fusion_all_reduce_use_buffer() && request_ids.size() > 1) { + LaunchFusedAllReduce(comm_group, device_id2stream_ctx, request_store, request_ids); + } else { + LaunchAggregatedOps(comm_group, device_id2stream_ctx, request_store, request_ids); + } + AddCallbackAndResetRuntimeRequest(comm_group, device_id2stream_ctx, request_store, request_ids); + } + + CollectiveBoxingConf conf; + int64_t fusion_threshold; + int32_t num_streams; + int32_t current_stream_id; + bool enable_mixed_fusion; + std::vector op_type2fusion_enabled; + std::shared_ptr request_store; + HashMap> device_set2stream_id2comm_group; + std::vector>> stream_id2device_id2stream_ctx; +}; + +NcclExecutorBackend::NcclExecutorBackend() = default; + 
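+// Editor's note (hedged summary, not part of the original patch): the public NcclExecutorBackend
+// methods below forward to Impl. The lifecycle, as implemented above, is roughly:
+//   Init(request_store);          // build Impl from the CollectiveBoxingConf
+//   InitJob(job_id);              // create CommGroups and StreamCtx for the job's device sets
+//   GroupRequests(ids, handler);  // fuse compatible requests; handler gets the ids plus a GroupToken
+//   ExecuteGroup(token);          // run the fused-buffer all-reduce or the aggregated NCCL ops,
+//                                 // then register completion callbacks on each stream
+//   DestroyGroupToken(token);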
+NcclExecutorBackend::~NcclExecutorBackend() = default; + +void NcclExecutorBackend::Init(std::shared_ptr request_store) { + impl_ = std::make_unique( + Singleton::Get()->collective_boxing_conf(), request_store); +} + +void NcclExecutorBackend::InitJob(int64_t job_id) { + CudaCurrentDeviceGuard guard; + impl_->InitCommGroup(job_id); +} + +void NcclExecutorBackend::DeinitJob(int64_t job_id) {} + +void NcclExecutorBackend::GroupRequests( + const std::vector& request_ids, + const std::function&&, void*)>& Handler) { + impl_->GroupRequests(request_ids, Handler); +} + +void* NcclExecutorBackend::CreateGroupToken(const std::vector& group) { + return impl_->CreateGroupToken(group); +} + +void NcclExecutorBackend::DestroyGroupToken(void* group_token) { + return impl_->DestroyGroupToken(group_token); +} + +void NcclExecutorBackend::ExecuteGroup(void* group_token) { impl_->ExecuteGroup(group_token); } + +} // namespace collective + +} // namespace boxing + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp index b2ab4cb..36d981a 100644 --- a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp +++ b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "oneflow/core/job/nd_sbp_util.h" -#if defined(WITH_CUDA) || defined(ROCM) +#if defined(WITH_CUDA) || defined(WITH_ROCM) #include "oneflow/core/framework/framework.h" #include "oneflow/core/framework/nd_sbp.h" #include "oneflow/core/framework/instructions_builder.h" diff --git a/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp b/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp index 961e203..6a524fa 100644 --- a/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp +++ b/oneflow/core/job_rewriter/sequential_one_embedding_shuffle_ops_pass.cpp @@ -1,80 +1,80 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "oneflow/core/job_rewriter/job_pass.h" -#include "oneflow/core/framework/framework.h" - -namespace oneflow { - -namespace { - -class SequentialOneEmbeddingOpsPass final : public JobPass { - public: - SequentialOneEmbeddingOpsPass() = default; - ~SequentialOneEmbeddingOpsPass() override = default; - - bool IsEnabled(const JobPassCtx& ctx) const { - return ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_PIPELINED_EXECUTION", false); - } - Maybe Apply(const OpGraph& op_graph, JobBuilder* job_builder) const; - - Maybe Apply(Job* job, JobPassCtx* ctx) const override { - if (!IsEnabled(*ctx)) { return Maybe::Ok(); } - const OpGraph op_graph(*job); - JobBuilder job_builder(job); - return Apply(op_graph, &job_builder); - } -}; - -bool IsUserOpWithTypeName(const OperatorConf& op_conf, const std::string& op_type_name) { - return op_conf.has_user_conf() && op_conf.user_conf().op_type_name() == op_type_name; -}; - -Maybe SequentialOneEmbeddingOpsPass::Apply(const OpGraph& op_graph, - JobBuilder* job_builder) const { - HashMap> stream_name_hint2shuffle_op_names; - op_graph.TopoForEachNode([&](const OpNode* op_node) { - if (!(IsUserOpWithTypeName(op_node->op().op_conf(), "id_shuffle") - || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_shuffle") - || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_gradient_shuffle"))) { - return; - } - OperatorConf op_conf = op_node->op().op_conf(); - std::string stream_name; - if (op_conf.has_stream_name_hint()) { - stream_name = op_conf.stream_name_hint(); - } else { - stream_name = "DEFAULT"; - } - const auto& it = stream_name_hint2shuffle_op_names.find(stream_name); - if (it != stream_name_hint2shuffle_op_names.end()) { - if (it->second.size() > 0) { - std::string pre_shuffle_op_name = it->second.back(); - op_conf.add_ctrl_in_op_name(pre_shuffle_op_name); - job_builder->MutOpsOnlyOnce({op_conf}); - } - it->second.push_back(op_conf.name()); - } else { - std::vector shuffle_ops{op_conf.name()}; - CHECK(stream_name_hint2shuffle_op_names.emplace(stream_name, shuffle_ops).second); - } - }); - - return Maybe::Ok(); -} - -} // namespace - -REGISTER_JOB_PASS("SequentialOneEmbeddingOpsPass", SequentialOneEmbeddingOpsPass); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "oneflow/core/job_rewriter/job_pass.h" +#include "oneflow/core/framework/framework.h" + +namespace oneflow { + +namespace { + +class SequentialOneEmbeddingOpsPass final : public JobPass { + public: + SequentialOneEmbeddingOpsPass() = default; + ~SequentialOneEmbeddingOpsPass() override = default; + + bool IsEnabled(const JobPassCtx& ctx) const { + return ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_DISABLE_PIPELINED_EXECUTION", false); + } + Maybe Apply(const OpGraph& op_graph, JobBuilder* job_builder) const; + + Maybe Apply(Job* job, JobPassCtx* ctx) const override { + if (!IsEnabled(*ctx)) { return Maybe::Ok(); } + const OpGraph op_graph(*job); + JobBuilder job_builder(job); + return Apply(op_graph, &job_builder); + } +}; + +bool IsUserOpWithTypeName(const OperatorConf& op_conf, const std::string& op_type_name) { + return op_conf.has_user_conf() && op_conf.user_conf().op_type_name() == op_type_name; +}; + +Maybe SequentialOneEmbeddingOpsPass::Apply(const OpGraph& op_graph, + JobBuilder* job_builder) const { + HashMap> stream_name_hint2shuffle_op_names; + op_graph.TopoForEachNode([&](const OpNode* op_node) { + if (!(IsUserOpWithTypeName(op_node->op().op_conf(), "id_shuffle") + || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_shuffle") + || IsUserOpWithTypeName(op_node->op().op_conf(), "embedding_gradient_shuffle"))) { + return; + } + OperatorConf op_conf = op_node->op().op_conf(); + std::string stream_name; + if (op_conf.has_stream_name_hint()) { + stream_name = op_conf.stream_name_hint(); + } else { + stream_name = "DEFAULT"; + } + const auto& it = stream_name_hint2shuffle_op_names.find(stream_name); + if (it != stream_name_hint2shuffle_op_names.end()) { + if (it->second.size() > 0) { + std::string pre_shuffle_op_name = it->second.back(); + op_conf.add_ctrl_in_op_name(pre_shuffle_op_name); + job_builder->MutOpsOnlyOnce({op_conf}); + } + it->second.push_back(op_conf.name()); + } else { + std::vector shuffle_ops{op_conf.name()}; + CHECK(stream_name_hint2shuffle_op_names.emplace(stream_name, shuffle_ops).second); + } + }); + + return Maybe::Ok(); +} + +} // namespace + +REGISTER_JOB_PASS("SequentialOneEmbeddingOpsPass", SequentialOneEmbeddingOpsPass); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp b/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp index b0c81a6..2c38465 100644 --- a/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp +++ b/oneflow/core/kernel/cuda_check_numerics_kernel_observer.hip.cpp @@ -1,133 +1,133 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/kernel/cuda_check_numerics_kernel_observer.h" -#include "oneflow/core/kernel/kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ bool IsNotFinite(T x) { - return !isfinite(x); -} - -template<> -__device__ bool IsNotFinite(half x) { - return (__hisinf(x) || __hisnan(x)); -} - -template -__global__ void HasNotFiniteGpuKernel(const int64_t n, const T* x, volatile bool* has_not_finite) { - if (*has_not_finite) { return; } - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { - if (IsNotFinite(x[i])) { - *has_not_finite = true; - return; - } - } -} - -template -bool HasNotFinite(ep::Stream* stream, const int64_t elem_cnt, const T* data_ptr, - bool* has_not_finite_host, bool* has_not_finite_device) { - OF_CUDA_CHECK(hipMemsetAsync(has_not_finite_device, 0, sizeof(bool), - stream->As()->cuda_stream())); - HasNotFiniteGpuKernel - <<As()->cuda_stream()>>>(elem_cnt, data_ptr, has_not_finite_device); - OF_CUDA_CHECK(hipMemcpyAsync(has_not_finite_host, has_not_finite_device, sizeof(bool), - hipMemcpyDefault, stream->As()->cuda_stream())); - OF_CUDA_CHECK(hipStreamSynchronize(stream->As()->cuda_stream())); - return *has_not_finite_host; -} - -bool HasNotFiniteGpu(ep::Stream* stream, const Blob* blob, bool* has_not_finite_host, - bool* has_not_finite_device) { - auto* cuda_stream = stream->As(); - const DataType dtype = blob->data_type(); - const int64_t elem_cnt = blob->shape().elem_cnt(); - if (elem_cnt == 0) { return false; } - if (dtype == kFloat) { - return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, - has_not_finite_device); - } else if (dtype == kDouble) { - return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, - has_not_finite_device); - } else if (dtype == kFloat16) { - if (cuda_stream->cuda_arch() >= 530) { - return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, - has_not_finite_device); - } else { - LOG(FATAL) << "use half need nvcc arch >= 530"; - return true; - } - } else { - return false; - } -} - -void DumpBlob(KernelContext* ctx, const std::string& bn) { - Blob* blob = ctx->BnInOp2Blob(bn); - if (blob != nullptr) { - std::vector buffer(blob->ByteSizeOfBlobBody()); - OF_CUDA_CHECK( - hipMemcpy(buffer.data(), blob->dptr(), blob->ByteSizeOfBlobBody(), hipMemcpyDefault)); - OF_CUDA_CHECK(hipDeviceSynchronize()); - std::ofstream ofs(bn); - ofs.write(buffer.data(), blob->ByteSizeOfBlobBody()); - } -} - -void DumpBlobs(KernelContext* ctx, const Kernel* kernel) { - for (const auto& obn : kernel->op_attribute().output_bns()) { DumpBlob(ctx, obn); } - for (const auto& ibn : kernel->op_attribute().input_bns()) { DumpBlob(ctx, ibn); } -} - -} // namespace - -CudaCheckNumericsKernelObserver::CudaCheckNumericsKernelObserver() - : has_not_finite_host_(nullptr), has_not_finite_device_(nullptr) { - OF_CUDA_CHECK(hipGetDevice(&device_id_)); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&has_not_finite_host_), sizeof(bool))); - OF_CUDA_CHECK(hipMalloc(&has_not_finite_device_, sizeof(bool))); -} - -CudaCheckNumericsKernelObserver::~CudaCheckNumericsKernelObserver() { - CudaCurrentDeviceGuard guard(device_id_); - OF_CUDA_CHECK(hipHostFree(has_not_finite_host_)); - OF_CUDA_CHECK(hipFree(has_not_finite_device_)); -} - -void CudaCheckNumericsKernelObserver::DidForwardDataContent(KernelContext* ctx, - const Kernel* kernel) { - for (const auto& obn : kernel->op_attribute().output_bns()) { - Blob* blob = ctx->BnInOp2Blob(obn); - if (blob != 
nullptr) { - bool has_not_finite = - HasNotFiniteGpu(ctx->stream(), blob, has_not_finite_host_, has_not_finite_device_); - if (has_not_finite - && ParseBooleanFromEnv("ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS_DUMP", false)) { - DumpBlobs(ctx, kernel); - } - CHECK(!has_not_finite) << kernel->op_conf().name() << " : " << obn << " has nan or inf"; - } - } -} - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/kernel/cuda_check_numerics_kernel_observer.h" +#include "oneflow/core/kernel/kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ bool IsNotFinite(T x) { + return !isfinite(x); +} + +template<> +__device__ bool IsNotFinite(half x) { + return (__hisinf(x) || __hisnan(x)); +} + +template +__global__ void HasNotFiniteGpuKernel(const int64_t n, const T* x, volatile bool* has_not_finite) { + if (*has_not_finite) { return; } + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { + if (IsNotFinite(x[i])) { + *has_not_finite = true; + return; + } + } +} + +template +bool HasNotFinite(ep::Stream* stream, const int64_t elem_cnt, const T* data_ptr, + bool* has_not_finite_host, bool* has_not_finite_device) { + OF_CUDA_CHECK(hipMemsetAsync(has_not_finite_device, 0, sizeof(bool), + stream->As()->cuda_stream())); + HasNotFiniteGpuKernel + <<As()->cuda_stream()>>>(elem_cnt, data_ptr, has_not_finite_device); + OF_CUDA_CHECK(hipMemcpyAsync(has_not_finite_host, has_not_finite_device, sizeof(bool), + hipMemcpyDefault, stream->As()->cuda_stream())); + OF_CUDA_CHECK(hipStreamSynchronize(stream->As()->cuda_stream())); + return *has_not_finite_host; +} + +bool HasNotFiniteGpu(ep::Stream* stream, const Blob* blob, bool* has_not_finite_host, + bool* has_not_finite_device) { + auto* cuda_stream = stream->As(); + const DataType dtype = blob->data_type(); + const int64_t elem_cnt = blob->shape().elem_cnt(); + if (elem_cnt == 0) { return false; } + if (dtype == kFloat) { + return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, + has_not_finite_device); + } else if (dtype == kDouble) { + return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, + has_not_finite_device); + } else if (dtype == kFloat16) { + if (cuda_stream->cuda_arch() >= 530) { + return HasNotFinite(stream, elem_cnt, blob->dptr(), has_not_finite_host, + has_not_finite_device); + } else { + LOG(FATAL) << "use half need nvcc arch >= 530"; + return true; + } + } else { + return false; + } +} + +void DumpBlob(KernelContext* ctx, const std::string& bn) { + Blob* blob = ctx->BnInOp2Blob(bn); + if (blob != nullptr) { + std::vector buffer(blob->ByteSizeOfBlobBody()); + OF_CUDA_CHECK( + hipMemcpy(buffer.data(), blob->dptr(), blob->ByteSizeOfBlobBody(), hipMemcpyDefault)); + OF_CUDA_CHECK(hipDeviceSynchronize()); + std::ofstream ofs(bn); + ofs.write(buffer.data(), blob->ByteSizeOfBlobBody()); + } +} + +void DumpBlobs(KernelContext* ctx, const Kernel* kernel) { + for (const auto& obn : 
kernel->op_attribute().output_bns()) { DumpBlob(ctx, obn); } + for (const auto& ibn : kernel->op_attribute().input_bns()) { DumpBlob(ctx, ibn); } +} + +} // namespace + +CudaCheckNumericsKernelObserver::CudaCheckNumericsKernelObserver() + : has_not_finite_host_(nullptr), has_not_finite_device_(nullptr) { + OF_CUDA_CHECK(hipGetDevice(&device_id_)); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&has_not_finite_host_), sizeof(bool))); + OF_CUDA_CHECK(hipMalloc(&has_not_finite_device_, sizeof(bool))); +} + +CudaCheckNumericsKernelObserver::~CudaCheckNumericsKernelObserver() { + CudaCurrentDeviceGuard guard(device_id_); + OF_CUDA_CHECK(hipHostFree(has_not_finite_host_)); + OF_CUDA_CHECK(hipFree(has_not_finite_device_)); +} + +void CudaCheckNumericsKernelObserver::DidForwardDataContent(KernelContext* ctx, + const Kernel* kernel) { + for (const auto& obn : kernel->op_attribute().output_bns()) { + Blob* blob = ctx->BnInOp2Blob(obn); + if (blob != nullptr) { + bool has_not_finite = + HasNotFiniteGpu(ctx->stream(), blob, has_not_finite_host_, has_not_finite_device_); + if (has_not_finite + && ParseBooleanFromEnv("ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS_DUMP", false)) { + DumpBlobs(ctx, kernel); + } + CHECK(!has_not_finite) << kernel->op_conf().name() << " : " << obn << " has nan or inf"; + } + } +} + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/kernel_util.hip.h b/oneflow/core/kernel/kernel_util.hip.h index fc466e8..15a01bc 100644 --- a/oneflow/core/kernel/kernel_util.hip.h +++ b/oneflow/core/kernel/kernel_util.hip.h @@ -1,53 +1,53 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ -#define ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ -#include "oneflow/core/device/cuda_pseudo_half.h" -#include "oneflow/core/common/data_type.h" - -namespace oneflow { - -template::value>::type* = nullptr> -OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { - const T threshold = 1e-20; - return x > threshold ? x : threshold; -} - -template::value>::type* = nullptr> -OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { - return x; -} - -#if defined(__CUDACC__) || defined(__HIPCC__) -__device__ __forceinline__ half MaxWithLogThreshold(half x) { - half threshold = hexp2(__float2half(-14.0)); - if (__hgt(x, threshold)) { return x; } - return threshold; -} -#endif - -template -OF_DEVICE_FUNC T SafeLog(T x) { - return logf(MaxWithLogThreshold(x)); -} - -#if defined(__CUDACC__) || defined(__HIPCC__) -__device__ __forceinline__ half SafeLog(half x) { return hlog(MaxWithLogThreshold(x)); } -#endif - -} // namespace oneflow - -#endif // ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ +#define ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ +#include "oneflow/core/device/cuda_pseudo_half.h" +#include "oneflow/core/common/data_type.h" + +namespace oneflow { + +template::value>::type* = nullptr> +OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { + const T threshold = 1e-20; + return x > threshold ? x : threshold; +} + +template::value>::type* = nullptr> +OF_DEVICE_FUNC T MaxWithLogThreshold(T x) { + return x; +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +__device__ __forceinline__ half MaxWithLogThreshold(half x) { + half threshold = hexp2(__float2half(-14.0)); + if (__hgt(x, threshold)) { return x; } + return threshold; +} +#endif + +template +OF_DEVICE_FUNC T SafeLog(T x) { + return logf(MaxWithLogThreshold(x)); +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +__device__ __forceinline__ half SafeLog(half x) { return hlog(MaxWithLogThreshold(x)); } +#endif + +} // namespace oneflow + +#endif // ONEFLOW_CORE_KERNEL_KERNEL_UTIL_HIP_H_ diff --git a/oneflow/core/kernel/random_generator.hip.cpp b/oneflow/core/kernel/random_generator.hip.cpp index e35a1c6..db8dbd9 100644 --- a/oneflow/core/kernel/random_generator.hip.cpp +++ b/oneflow/core/kernel/random_generator.hip.cpp @@ -1,59 +1,59 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/kernel/random_generator.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - - -namespace oneflow { - -namespace { - -template -void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, T* ret); - -template<> -void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, float* ret) { - OF_CURAND_CHECK(hiprandGenerateUniform(gen, ret, n)); -} - -template<> -void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, double* ret) { - OF_CURAND_CHECK(hiprandGenerateUniformDouble(gen, ret, n)); -} - -} // namespace - -RandomGenerator::RandomGenerator(int64_t seed, ep::Stream* stream) { - OF_CURAND_CHECK(hiprandCreateGenerator(&curand_generator_, HIPRAND_RNG_PSEUDO_DEFAULT)); - OF_CURAND_CHECK(hiprandSetPseudoRandomGeneratorSeed(curand_generator_, seed)); - OF_CURAND_CHECK(hiprandSetStream(curand_generator_, stream->As()->cuda_stream())); -} - -RandomGenerator::~RandomGenerator() { - OF_CURAND_CHECK(hiprandDestroyGenerator(curand_generator_)); -} - -template -void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr) { - RngUniformGpu(curand_generator_, elem_cnt, dptr); -} - -#define INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM(T, typeproto) \ - template void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr); - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM, FLOATING_DATA_TYPE_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/kernel/random_generator.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + + +namespace oneflow { + +namespace { + +template +void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, T* ret); + +template<> +void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, float* ret) { + OF_CURAND_CHECK(hiprandGenerateUniform(gen, ret, n)); +} + +template<> +void RngUniformGpu(const hiprandGenerator_t& gen, int64_t n, double* ret) { + OF_CURAND_CHECK(hiprandGenerateUniformDouble(gen, ret, n)); +} + +} // namespace + +RandomGenerator::RandomGenerator(int64_t seed, ep::Stream* stream) { + OF_CURAND_CHECK(hiprandCreateGenerator(&curand_generator_, HIPRAND_RNG_PSEUDO_DEFAULT)); + OF_CURAND_CHECK(hiprandSetPseudoRandomGeneratorSeed(curand_generator_, seed)); + OF_CURAND_CHECK(hiprandSetStream(curand_generator_, stream->As()->cuda_stream())); +} + +RandomGenerator::~RandomGenerator() { + OF_CURAND_CHECK(hiprandDestroyGenerator(curand_generator_)); +} + +template +void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr) { + RngUniformGpu(curand_generator_, elem_cnt, dptr); +} + +#define INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM(T, typeproto) \ + template void RandomGenerator::Uniform(const int64_t elem_cnt, T* dptr); + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_RANDOM_GENERATOR_UNIFORM, FLOATING_DATA_TYPE_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/util/numeric_limits.hip.h b/oneflow/core/kernel/util/numeric_limits.hip.h index 96a9b10..7cdc409 100644 --- a/oneflow/core/kernel/util/numeric_limits.hip.h +++ b/oneflow/core/kernel/util/numeric_limits.hip.h @@ -1,128 +1,128 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/NumericLimits.cuh -#pragma once -#include -#include -#include - -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" - -// numeric_limits.cuh is a holder for numeric limits definitions of commonly used -// types. This header is very specific to ROCm HIP and may be removed in the future. - -// The lower_bound and upper_bound constants are same as lowest and max for -// integral types, but are -inf and +inf for floating point types. They are -// useful in implementing min, max, etc. - -namespace oneflow { -namespace detail { - -#if defined(__HIPCC__) -#define OF_NUMERICS_FUNC static inline __host__ __device__ -#else -#define OF_NUMERICS_FUNC static inline -#endif - -template -struct numeric_limits {}; - -// WARNING: the following oneflow::numeric_limits definitions are there only to support -// HIP compilation for the moment. Use std::numeric_limits if you are not -// compiling for ROCm. -// from @colesbury: "The functions on numeric_limits aren't marked with -// __device__ which is why they don't work with ROCm. CUDA allows them -// because they're constexpr." - -namespace { -// ROCm doesn't like INFINITY too. 
-constexpr double inf = INFINITY; -} // namespace - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC bool lowest() { return false; } - OF_NUMERICS_FUNC bool max() { return true; } - OF_NUMERICS_FUNC bool lower_bound() { return false; } - OF_NUMERICS_FUNC bool upper_bound() { return true; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC uint8_t lowest() { return 0; } - OF_NUMERICS_FUNC uint8_t max() { return UINT8_MAX; } - OF_NUMERICS_FUNC uint8_t lower_bound() { return 0; } - OF_NUMERICS_FUNC uint8_t upper_bound() { return UINT8_MAX; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC int8_t lowest() { return INT8_MIN; } - OF_NUMERICS_FUNC int8_t max() { return INT8_MAX; } - OF_NUMERICS_FUNC int8_t lower_bound() { return INT8_MIN; } - OF_NUMERICS_FUNC int8_t upper_bound() { return INT8_MAX; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC int16_t lowest() { return INT16_MIN; } - OF_NUMERICS_FUNC int16_t max() { return INT16_MAX; } - OF_NUMERICS_FUNC int16_t lower_bound() { return INT16_MIN; } - OF_NUMERICS_FUNC int16_t upper_bound() { return INT16_MAX; } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC int32_t lowest() { return INT32_MIN; } - OF_NUMERICS_FUNC int32_t max() { return INT32_MAX; } - OF_NUMERICS_FUNC int32_t lower_bound() { return INT32_MIN; } - OF_NUMERICS_FUNC int32_t upper_bound() { return INT32_MAX; } -}; - -template<> -struct numeric_limits { -#ifdef _MSC_VER - OF_NUMERICS_FUNC int64_t lowest() { return _I64_MIN; } - OF_NUMERICS_FUNC int64_t max() { return _I64_MAX; } - OF_NUMERICS_FUNC int64_t lower_bound() { return _I64_MIN; } - OF_NUMERICS_FUNC int64_t upper_bound() { return _I64_MAX; } -#else - OF_NUMERICS_FUNC int64_t lowest() { return INT64_MIN; } - OF_NUMERICS_FUNC int64_t max() { return INT64_MAX; } - OF_NUMERICS_FUNC int64_t lower_bound() { return INT64_MIN; } - OF_NUMERICS_FUNC int64_t upper_bound() { return INT64_MAX; } -#endif -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC float lowest() { return -FLT_MAX; } - OF_NUMERICS_FUNC float max() { return FLT_MAX; } - OF_NUMERICS_FUNC float lower_bound() { return -static_cast(inf); } - OF_NUMERICS_FUNC float upper_bound() { return static_cast(inf); } -}; - -template<> -struct numeric_limits { - OF_NUMERICS_FUNC double lowest() { return -DBL_MAX; } - OF_NUMERICS_FUNC double max() { return DBL_MAX; } - OF_NUMERICS_FUNC double lower_bound() { return -inf; } - OF_NUMERICS_FUNC double upper_bound() { return inf; } -}; - -} // namespace detail +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/NumericLimits.cuh +#pragma once +#include +#include +#include + +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" + +// numeric_limits.cuh is a holder for numeric limits definitions of commonly used +// types. This header is very specific to ROCm HIP and may be removed in the future. 
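+// Editor's note (usage sketch, not part of the original patch; the identifiers mirror the
+// definitions in this header): device code can query type-dependent bounds without relying on
+// std::numeric_limits, whose member functions are not __device__-qualified, e.g.
+//   float lo = oneflow::detail::numeric_limits<float>::lower_bound();      // -inf
+//   int32_t hi = oneflow::detail::numeric_limits<int32_t>::upper_bound();  // INT32_MAX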
+ +// The lower_bound and upper_bound constants are same as lowest and max for +// integral types, but are -inf and +inf for floating point types. They are +// useful in implementing min, max, etc. + +namespace oneflow { +namespace detail { + +#if defined(__HIPCC__) +#define OF_NUMERICS_FUNC static inline __host__ __device__ +#else +#define OF_NUMERICS_FUNC static inline +#endif + +template +struct numeric_limits {}; + +// WARNING: the following oneflow::numeric_limits definitions are there only to support +// HIP compilation for the moment. Use std::numeric_limits if you are not +// compiling for ROCm. +// from @colesbury: "The functions on numeric_limits aren't marked with +// __device__ which is why they don't work with ROCm. CUDA allows them +// because they're constexpr." + +namespace { +// ROCm doesn't like INFINITY too. +constexpr double inf = INFINITY; +} // namespace + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC bool lowest() { return false; } + OF_NUMERICS_FUNC bool max() { return true; } + OF_NUMERICS_FUNC bool lower_bound() { return false; } + OF_NUMERICS_FUNC bool upper_bound() { return true; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC uint8_t lowest() { return 0; } + OF_NUMERICS_FUNC uint8_t max() { return UINT8_MAX; } + OF_NUMERICS_FUNC uint8_t lower_bound() { return 0; } + OF_NUMERICS_FUNC uint8_t upper_bound() { return UINT8_MAX; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC int8_t lowest() { return INT8_MIN; } + OF_NUMERICS_FUNC int8_t max() { return INT8_MAX; } + OF_NUMERICS_FUNC int8_t lower_bound() { return INT8_MIN; } + OF_NUMERICS_FUNC int8_t upper_bound() { return INT8_MAX; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC int16_t lowest() { return INT16_MIN; } + OF_NUMERICS_FUNC int16_t max() { return INT16_MAX; } + OF_NUMERICS_FUNC int16_t lower_bound() { return INT16_MIN; } + OF_NUMERICS_FUNC int16_t upper_bound() { return INT16_MAX; } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC int32_t lowest() { return INT32_MIN; } + OF_NUMERICS_FUNC int32_t max() { return INT32_MAX; } + OF_NUMERICS_FUNC int32_t lower_bound() { return INT32_MIN; } + OF_NUMERICS_FUNC int32_t upper_bound() { return INT32_MAX; } +}; + +template<> +struct numeric_limits { +#ifdef _MSC_VER + OF_NUMERICS_FUNC int64_t lowest() { return _I64_MIN; } + OF_NUMERICS_FUNC int64_t max() { return _I64_MAX; } + OF_NUMERICS_FUNC int64_t lower_bound() { return _I64_MIN; } + OF_NUMERICS_FUNC int64_t upper_bound() { return _I64_MAX; } +#else + OF_NUMERICS_FUNC int64_t lowest() { return INT64_MIN; } + OF_NUMERICS_FUNC int64_t max() { return INT64_MAX; } + OF_NUMERICS_FUNC int64_t lower_bound() { return INT64_MIN; } + OF_NUMERICS_FUNC int64_t upper_bound() { return INT64_MAX; } +#endif +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC float lowest() { return -FLT_MAX; } + OF_NUMERICS_FUNC float max() { return FLT_MAX; } + OF_NUMERICS_FUNC float lower_bound() { return -static_cast(inf); } + OF_NUMERICS_FUNC float upper_bound() { return static_cast(inf); } +}; + +template<> +struct numeric_limits { + OF_NUMERICS_FUNC double lowest() { return -DBL_MAX; } + OF_NUMERICS_FUNC double max() { return DBL_MAX; } + OF_NUMERICS_FUNC double lower_bound() { return -inf; } + OF_NUMERICS_FUNC double upper_bound() { return inf; } +}; + +} // namespace detail } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/kernel/util/numerics.hip.h b/oneflow/core/kernel/util/numerics.hip.h index 5b1bad8..68b1b53 100644 --- 
a/oneflow/core/kernel/util/numerics.hip.h +++ b/oneflow/core/kernel/util/numerics.hip.h @@ -1,250 +1,250 @@ - -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCNumerics.cuh -#ifndef ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H -#define ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H -#pragma once - -#include -#include -#include -#include -#include - -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/util/numeric_limits.hip.h" - -namespace oneflow { -namespace detail { - -template -struct numerics {}; - -template -OF_NUMERICS_FUNC T powi(T a, T b) { - assert(numerics::ge(b, 0)); - T result = 1; - while (b) { - if (b & 1) { result *= a; } - b /= 2; - a *= a; - } - return result; -} - -template<> -struct numerics { - OF_NUMERICS_FUNC uint8_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC uint8_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC uint8_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC uint8_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(uint8_t a, uint8_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(uint8_t a, uint8_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(uint8_t a, uint8_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(uint8_t a, uint8_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(uint8_t a, uint8_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(uint8_t a, uint8_t b) { return a != b; } - - OF_NUMERICS_FUNC uint8_t add(uint8_t a, uint8_t b) { return a + b; } - OF_NUMERICS_FUNC uint8_t mul(uint8_t a, uint8_t b) { return a * b; } - OF_NUMERICS_FUNC uint8_t sub(uint8_t a, uint8_t b) { return a - b; } - OF_NUMERICS_FUNC uint8_t div(uint8_t a, uint8_t b) { return a / b; } - OF_NUMERICS_FUNC uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(uint8_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(uint8_t a) { return false; } -}; - -#ifdef _MSC_VER -// Suppress warning C4804: '/': unsafe use of type 'bool' in operation -#pragma warning(push) -#pragma warning(disable : 4804) -#endif - -template<> -struct numerics { - OF_NUMERICS_FUNC bool min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC bool max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC bool lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC bool upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(bool a, bool b) { return a < b; } - OF_NUMERICS_FUNC bool le(bool a, bool b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(bool a, bool b) { return a > b; } - OF_NUMERICS_FUNC bool ge(bool a, bool b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(bool a, bool b) { return a == b; } - OF_NUMERICS_FUNC bool ne(bool a, bool b) { return a != b; } - OF_NUMERICS_FUNC bool add(bool a, 
bool b) { return a + b; } - OF_NUMERICS_FUNC bool mul(bool a, bool b) { return a && b; } - OF_NUMERICS_FUNC bool sub(bool a, bool b) { return a - b; } - OF_NUMERICS_FUNC bool div(bool a, bool b) { return a / b; } - OF_NUMERICS_FUNC bool isnan(bool a) { return false; } - OF_NUMERICS_FUNC bool isinf(bool a) { return false; } -}; - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -template<> -struct numerics { - OF_NUMERICS_FUNC int8_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int8_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int8_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int8_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int8_t a, int8_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int8_t a, int8_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int8_t a, int8_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int8_t a, int8_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int8_t a, int8_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(int8_t a, int8_t b) { return a != b; } - - OF_NUMERICS_FUNC int8_t add(int8_t a, int8_t b) { return a + b; } - OF_NUMERICS_FUNC int8_t mul(int8_t a, int8_t b) { return a * b; } - OF_NUMERICS_FUNC int8_t sub(int8_t a, int8_t b) { return a - b; } - OF_NUMERICS_FUNC int8_t div(int8_t a, int8_t b) { return a / b; } - OF_NUMERICS_FUNC int8_t pow(int8_t a, int8_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int8_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int8_t a) { return false; } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC int16_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int16_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int16_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int16_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int16_t a, int16_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int16_t a, int16_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int16_t a, int16_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int16_t a, int16_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int16_t a, int16_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(int16_t a, int16_t b) { return a != b; } - - OF_NUMERICS_FUNC int16_t add(int16_t a, int16_t b) { return a + b; } - OF_NUMERICS_FUNC int16_t mul(int16_t a, int16_t b) { return a * b; } - OF_NUMERICS_FUNC int16_t sub(int16_t a, int16_t b) { return a - b; } - OF_NUMERICS_FUNC int16_t div(int16_t a, int16_t b) { return a / b; } - OF_NUMERICS_FUNC int16_t pow(int16_t a, int16_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int16_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int16_t a) { return false; } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC int32_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int32_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int32_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int32_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int32_t a, int32_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int32_t a, int32_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int32_t a, int32_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int32_t a, int32_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int32_t a, int32_t b) 
{ return a == b; } - OF_NUMERICS_FUNC bool ne(int32_t a, int32_t b) { return a != b; } - - OF_NUMERICS_FUNC int32_t add(int32_t a, int32_t b) { return a + b; } - OF_NUMERICS_FUNC int32_t mul(int32_t a, int32_t b) { return a * b; } - OF_NUMERICS_FUNC int32_t sub(int32_t a, int32_t b) { return a - b; } - OF_NUMERICS_FUNC int32_t div(int32_t a, int32_t b) { return a / b; } - OF_NUMERICS_FUNC int32_t pow(int32_t a, int32_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int32_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int32_t a) { return false; } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC int64_t min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC int64_t max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC int64_t lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC int64_t upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(int64_t a, int64_t b) { return a < b; } - OF_NUMERICS_FUNC bool le(int64_t a, int64_t b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(int64_t a, int64_t b) { return a > b; } - OF_NUMERICS_FUNC bool ge(int64_t a, int64_t b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(int64_t a, int64_t b) { return a == b; } - OF_NUMERICS_FUNC bool ne(int64_t a, int64_t b) { return a != b; } - - OF_NUMERICS_FUNC int64_t add(int64_t a, int64_t b) { return a + b; } - OF_NUMERICS_FUNC int64_t mul(int64_t a, int64_t b) { return a * b; } - OF_NUMERICS_FUNC int64_t sub(int64_t a, int64_t b) { return a - b; } - OF_NUMERICS_FUNC int64_t div(int64_t a, int64_t b) { return a / b; }; - OF_NUMERICS_FUNC int64_t pow(int64_t a, int64_t b) { return powi(a, b); } - OF_NUMERICS_FUNC bool isnan(int64_t a) { return false; } - OF_NUMERICS_FUNC bool isinf(int64_t a) { return false; } -}; - -// DEPRECATED: use math functions from std and cuda math API (if needed) -template<> -struct numerics { - OF_NUMERICS_FUNC float min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC float max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC float lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC float upper_bound() { return detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(float a, float b) { return a < b; } - OF_NUMERICS_FUNC bool le(float a, float b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(float a, float b) { return a > b; } - OF_NUMERICS_FUNC bool ge(float a, float b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(float a, float b) { return a == b; } - OF_NUMERICS_FUNC bool ne(float a, float b) { return a != b; } - - OF_NUMERICS_FUNC float sqrt(float a) { return sqrtf(a); } - OF_NUMERICS_FUNC float atan(float a) { return atanf(a); } - OF_NUMERICS_FUNC float add(float a, float b) { return a + b; } - OF_NUMERICS_FUNC float div(float a, float b) { return a / b; } - OF_NUMERICS_FUNC float mul(float a, float b) { return a * b; } - OF_NUMERICS_FUNC float sub(float a, float b) { return a - b; } - OF_NUMERICS_FUNC float pow(float a, float b) { return powf(a, b); } - OF_NUMERICS_FUNC bool isnan(float a) { return ::isnan(a); } - OF_NUMERICS_FUNC bool isinf(float a) { return ::isinf(a); } -}; - -template<> -struct numerics { - OF_NUMERICS_FUNC double min() { return detail::numeric_limits::lowest(); } - OF_NUMERICS_FUNC double max() { return detail::numeric_limits::max(); } - OF_NUMERICS_FUNC double lower_bound() { return detail::numeric_limits::lower_bound(); } - OF_NUMERICS_FUNC double upper_bound() { return 
detail::numeric_limits::upper_bound(); } - - OF_NUMERICS_FUNC bool lt(double a, double b) { return a < b; } - OF_NUMERICS_FUNC bool le(double a, double b) { return a <= b; } - OF_NUMERICS_FUNC bool gt(double a, double b) { return a > b; } - OF_NUMERICS_FUNC bool ge(double a, double b) { return a >= b; } - OF_NUMERICS_FUNC bool eq(double a, double b) { return a == b; } - OF_NUMERICS_FUNC bool ne(double a, double b) { return a != b; } - - OF_NUMERICS_FUNC double sqrt(double a) { return ::sqrt(a); } - OF_NUMERICS_FUNC double atan(double a) { return ::atan(a); } - OF_NUMERICS_FUNC double add(double a, double b) { return a + b; } - OF_NUMERICS_FUNC double div(double a, double b) { return a / b; } - OF_NUMERICS_FUNC double mul(double a, double b) { return a * b; } - OF_NUMERICS_FUNC double sub(double a, double b) { return a - b; } - OF_NUMERICS_FUNC double pow(double a, double b) { return ::pow(a, b); } - OF_NUMERICS_FUNC bool isnan(double a) { return ::isnan(a); } - OF_NUMERICS_FUNC bool isinf(double a) { return ::isinf(a); } -}; - -} // namespace detail -} // namespace oneflow - + +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// reference: https://github.com/pytorch/pytorch/blob/master/aten/src/THC/THCNumerics.cuh +#ifndef ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H +#define ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H +#pragma once + +#include +#include +#include +#include +#include + +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/util/numeric_limits.hip.h" + +namespace oneflow { +namespace detail { + +template +struct numerics {}; + +template +OF_NUMERICS_FUNC T powi(T a, T b) { + assert(numerics::ge(b, 0)); + T result = 1; + while (b) { + if (b & 1) { result *= a; } + b /= 2; + a *= a; + } + return result; +} + +template<> +struct numerics { + OF_NUMERICS_FUNC uint8_t min() { return detail::numeric_limits::lowest(); } + OF_NUMERICS_FUNC uint8_t max() { return detail::numeric_limits::max(); } + OF_NUMERICS_FUNC uint8_t lower_bound() { return detail::numeric_limits::lower_bound(); } + OF_NUMERICS_FUNC uint8_t upper_bound() { return detail::numeric_limits::upper_bound(); } + + OF_NUMERICS_FUNC bool lt(uint8_t a, uint8_t b) { return a < b; } + OF_NUMERICS_FUNC bool le(uint8_t a, uint8_t b) { return a <= b; } + OF_NUMERICS_FUNC bool gt(uint8_t a, uint8_t b) { return a > b; } + OF_NUMERICS_FUNC bool ge(uint8_t a, uint8_t b) { return a >= b; } + OF_NUMERICS_FUNC bool eq(uint8_t a, uint8_t b) { return a == b; } + OF_NUMERICS_FUNC bool ne(uint8_t a, uint8_t b) { return a != b; } + + OF_NUMERICS_FUNC uint8_t add(uint8_t a, uint8_t b) { return a + b; } + OF_NUMERICS_FUNC uint8_t mul(uint8_t a, uint8_t b) { return a * b; } + OF_NUMERICS_FUNC uint8_t sub(uint8_t a, uint8_t b) { return a - b; } + OF_NUMERICS_FUNC uint8_t div(uint8_t a, uint8_t b) { return a / b; } + OF_NUMERICS_FUNC uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } + OF_NUMERICS_FUNC bool isnan(uint8_t a) { return false; 
}
+  OF_NUMERICS_FUNC bool isinf(uint8_t a) { return false; }
+};
+
+#ifdef _MSC_VER
+// Suppress warning C4804: '/': unsafe use of type 'bool' in operation
+#pragma warning(push)
+#pragma warning(disable : 4804)
+#endif
+
+template<>
+struct numerics<bool> {
+  OF_NUMERICS_FUNC bool min() { return detail::numeric_limits<bool>::lowest(); }
+  OF_NUMERICS_FUNC bool max() { return detail::numeric_limits<bool>::max(); }
+  OF_NUMERICS_FUNC bool lower_bound() { return detail::numeric_limits<bool>::lower_bound(); }
+  OF_NUMERICS_FUNC bool upper_bound() { return detail::numeric_limits<bool>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(bool a, bool b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(bool a, bool b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(bool a, bool b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(bool a, bool b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(bool a, bool b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(bool a, bool b) { return a != b; }
+  OF_NUMERICS_FUNC bool add(bool a, bool b) { return a + b; }
+  OF_NUMERICS_FUNC bool mul(bool a, bool b) { return a && b; }
+  OF_NUMERICS_FUNC bool sub(bool a, bool b) { return a - b; }
+  OF_NUMERICS_FUNC bool div(bool a, bool b) { return a / b; }
+  OF_NUMERICS_FUNC bool isnan(bool a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(bool a) { return false; }
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+template<>
+struct numerics<int8_t> {
+  OF_NUMERICS_FUNC int8_t min() { return detail::numeric_limits<int8_t>::lowest(); }
+  OF_NUMERICS_FUNC int8_t max() { return detail::numeric_limits<int8_t>::max(); }
+  OF_NUMERICS_FUNC int8_t lower_bound() { return detail::numeric_limits<int8_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int8_t upper_bound() { return detail::numeric_limits<int8_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int8_t a, int8_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int8_t a, int8_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int8_t a, int8_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int8_t a, int8_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int8_t a, int8_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int8_t a, int8_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int8_t add(int8_t a, int8_t b) { return a + b; }
+  OF_NUMERICS_FUNC int8_t mul(int8_t a, int8_t b) { return a * b; }
+  OF_NUMERICS_FUNC int8_t sub(int8_t a, int8_t b) { return a - b; }
+  OF_NUMERICS_FUNC int8_t div(int8_t a, int8_t b) { return a / b; }
+  OF_NUMERICS_FUNC int8_t pow(int8_t a, int8_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int8_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int8_t a) { return false; }
+};
+
+template<>
+struct numerics<int16_t> {
+  OF_NUMERICS_FUNC int16_t min() { return detail::numeric_limits<int16_t>::lowest(); }
+  OF_NUMERICS_FUNC int16_t max() { return detail::numeric_limits<int16_t>::max(); }
+  OF_NUMERICS_FUNC int16_t lower_bound() { return detail::numeric_limits<int16_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int16_t upper_bound() { return detail::numeric_limits<int16_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int16_t a, int16_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int16_t a, int16_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int16_t a, int16_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int16_t a, int16_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int16_t a, int16_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int16_t a, int16_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int16_t add(int16_t a, int16_t b) { return a + b; }
+  OF_NUMERICS_FUNC int16_t mul(int16_t a, int16_t b) { return a * b; }
+  OF_NUMERICS_FUNC int16_t sub(int16_t a, int16_t b) { return a - b; }
+  OF_NUMERICS_FUNC int16_t div(int16_t a, int16_t b) { return a / b; }
+  OF_NUMERICS_FUNC int16_t pow(int16_t a, int16_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int16_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int16_t a) { return false; }
+};
+
+template<>
+struct numerics<int32_t> {
+  OF_NUMERICS_FUNC int32_t min() { return detail::numeric_limits<int32_t>::lowest(); }
+  OF_NUMERICS_FUNC int32_t max() { return detail::numeric_limits<int32_t>::max(); }
+  OF_NUMERICS_FUNC int32_t lower_bound() { return detail::numeric_limits<int32_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int32_t upper_bound() { return detail::numeric_limits<int32_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int32_t a, int32_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int32_t a, int32_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int32_t a, int32_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int32_t a, int32_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int32_t a, int32_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int32_t a, int32_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int32_t add(int32_t a, int32_t b) { return a + b; }
+  OF_NUMERICS_FUNC int32_t mul(int32_t a, int32_t b) { return a * b; }
+  OF_NUMERICS_FUNC int32_t sub(int32_t a, int32_t b) { return a - b; }
+  OF_NUMERICS_FUNC int32_t div(int32_t a, int32_t b) { return a / b; }
+  OF_NUMERICS_FUNC int32_t pow(int32_t a, int32_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int32_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int32_t a) { return false; }
+};
+
+template<>
+struct numerics<int64_t> {
+  OF_NUMERICS_FUNC int64_t min() { return detail::numeric_limits<int64_t>::lowest(); }
+  OF_NUMERICS_FUNC int64_t max() { return detail::numeric_limits<int64_t>::max(); }
+  OF_NUMERICS_FUNC int64_t lower_bound() { return detail::numeric_limits<int64_t>::lower_bound(); }
+  OF_NUMERICS_FUNC int64_t upper_bound() { return detail::numeric_limits<int64_t>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(int64_t a, int64_t b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(int64_t a, int64_t b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(int64_t a, int64_t b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(int64_t a, int64_t b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(int64_t a, int64_t b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(int64_t a, int64_t b) { return a != b; }
+
+  OF_NUMERICS_FUNC int64_t add(int64_t a, int64_t b) { return a + b; }
+  OF_NUMERICS_FUNC int64_t mul(int64_t a, int64_t b) { return a * b; }
+  OF_NUMERICS_FUNC int64_t sub(int64_t a, int64_t b) { return a - b; }
+  OF_NUMERICS_FUNC int64_t div(int64_t a, int64_t b) { return a / b; }
+  OF_NUMERICS_FUNC int64_t pow(int64_t a, int64_t b) { return powi(a, b); }
+  OF_NUMERICS_FUNC bool isnan(int64_t a) { return false; }
+  OF_NUMERICS_FUNC bool isinf(int64_t a) { return false; }
+};
+
+// DEPRECATED: use math functions from std and cuda math API (if needed)
+template<>
+struct numerics<float> {
+  OF_NUMERICS_FUNC float min() { return detail::numeric_limits<float>::lowest(); }
+  OF_NUMERICS_FUNC float max() { return detail::numeric_limits<float>::max(); }
+  OF_NUMERICS_FUNC float lower_bound() { return detail::numeric_limits<float>::lower_bound(); }
+  OF_NUMERICS_FUNC float upper_bound() { return detail::numeric_limits<float>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(float a, float b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(float a, float b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(float a, float b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(float a, float b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(float a, float b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(float a, float b) { return a != b; }
+
+  OF_NUMERICS_FUNC float sqrt(float a) { return sqrtf(a); }
+  OF_NUMERICS_FUNC float atan(float a) { return atanf(a); }
+  OF_NUMERICS_FUNC float add(float a, float b) { return a + b; }
+  OF_NUMERICS_FUNC float div(float a, float b) { return a / b; }
+  OF_NUMERICS_FUNC float mul(float a, float b) { return a * b; }
+  OF_NUMERICS_FUNC float sub(float a, float b) { return a - b; }
+  OF_NUMERICS_FUNC float pow(float a, float b) { return powf(a, b); }
+  OF_NUMERICS_FUNC bool isnan(float a) { return ::isnan(a); }
+  OF_NUMERICS_FUNC bool isinf(float a) { return ::isinf(a); }
+};
+
+template<>
+struct numerics<double> {
+  OF_NUMERICS_FUNC double min() { return detail::numeric_limits<double>::lowest(); }
+  OF_NUMERICS_FUNC double max() { return detail::numeric_limits<double>::max(); }
+  OF_NUMERICS_FUNC double lower_bound() { return detail::numeric_limits<double>::lower_bound(); }
+  OF_NUMERICS_FUNC double upper_bound() { return detail::numeric_limits<double>::upper_bound(); }
+
+  OF_NUMERICS_FUNC bool lt(double a, double b) { return a < b; }
+  OF_NUMERICS_FUNC bool le(double a, double b) { return a <= b; }
+  OF_NUMERICS_FUNC bool gt(double a, double b) { return a > b; }
+  OF_NUMERICS_FUNC bool ge(double a, double b) { return a >= b; }
+  OF_NUMERICS_FUNC bool eq(double a, double b) { return a == b; }
+  OF_NUMERICS_FUNC bool ne(double a, double b) { return a != b; }
+
+  OF_NUMERICS_FUNC double sqrt(double a) { return ::sqrt(a); }
+  OF_NUMERICS_FUNC double atan(double a) { return ::atan(a); }
+  OF_NUMERICS_FUNC double add(double a, double b) { return a + b; }
+  OF_NUMERICS_FUNC double div(double a, double b) { return a / b; }
+  OF_NUMERICS_FUNC double mul(double a, double b) { return a * b; }
+  OF_NUMERICS_FUNC double sub(double a, double b) { return a - b; }
+  OF_NUMERICS_FUNC double pow(double a, double b) { return ::pow(a, b); }
+  OF_NUMERICS_FUNC bool isnan(double a) { return ::isnan(a); }
+  OF_NUMERICS_FUNC bool isinf(double a) { return ::isinf(a); }
+};
+
+} // namespace detail
+} // namespace oneflow
+
 #endif  // ONEFLOW_CORE_KERNEL_UTIL_NUMERICS_HIP_H
\ No newline at end of file
diff --git a/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp
index 9ae4c6f..d24d580 100644
--- a/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp
+++ b/oneflow/core/ndarray/ndarray_apply_binary_core.hip.cpp
@@ -1,68 +1,68 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
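// Illustrative sketch (standalone, not OneFlow code): the powi helper defined earlier in
// numerics.hip.h is integer exponentiation by squaring -- it halves the exponent each step,
// so pow(a, b) for integral types costs O(log b) multiplies instead of b. The host-only copy
// below mirrors that loop; PowiRef is an assumed name used only for this check.
#include <cassert>
#include <cstdint>

static int64_t PowiRef(int64_t a, int64_t b) {  // assumes b >= 0, mirroring the assert in powi
  int64_t result = 1;
  while (b) {
    if (b & 1) { result *= a; }
    b /= 2;
    a *= a;
  }
  return result;
}

int main() {
  assert(PowiRef(3, 0) == 1);
  assert(PowiRef(3, 5) == 243);    // 3^5 via repeated squaring
  assert(PowiRef(2, 10) == 1024);  // 2^10
  return 0;
}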
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_binary_core.h" -#include "oneflow/core/ndarray/binary_func.h" - -namespace oneflow { - -namespace { - -template class binary_func> -__global__ void NdarrayApplyBinaryApplyGpu(size_t n, - typename BinaryFuncTrait::return_type* y, - const T* a, const T* b) { - NdarrayApplyBinaryCore::Apply(n, y, a, b); -} - -template class binary_func> -__global__ void NdarrayApplyBinaryInplaceApplyGpu(size_t n, T* y, const T* x) { - NdarrayApplyBinaryCore::InplaceApply(n, y, x); -} - -} // namespace - -template class binary_func> -struct NdarrayApplyBinaryCoreWrapper final { - static void Apply(ep::Stream* stream, - const XpuVarNdarray::return_type>& y, - const XpuVarNdarray& a, const XpuVarNdarray& b) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayApplyBinaryApplyGpu), stream, n, n, y.host_ptr(), - a.host_ptr(), b.host_ptr()); - } - static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayApplyBinaryInplaceApplyGpu), stream, n, n, y.host_ptr(), - x.host_ptr()); - } -}; - -#define INSTANTIATE_NDARRAY_APPLY_BINARY_CORE(dtype_pair, binary_func) \ - template struct NdarrayApplyBinaryCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - ARITHMETIC_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - LOGICAL_BINARY_FUNC_SEQ); +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_binary_core.h" +#include "oneflow/core/ndarray/binary_func.h" + +namespace oneflow { + +namespace { + +template class binary_func> +__global__ void NdarrayApplyBinaryApplyGpu(size_t n, + typename BinaryFuncTrait::return_type* y, + const T* a, const T* b) { + NdarrayApplyBinaryCore::Apply(n, y, a, b); +} + +template class binary_func> +__global__ void NdarrayApplyBinaryInplaceApplyGpu(size_t n, T* y, const T* x) { + NdarrayApplyBinaryCore::InplaceApply(n, y, x); +} + +} // namespace + +template class binary_func> +struct NdarrayApplyBinaryCoreWrapper final { + static void Apply(ep::Stream* stream, + const XpuVarNdarray::return_type>& y, + const XpuVarNdarray& a, const XpuVarNdarray& b) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayApplyBinaryApplyGpu), stream, n, n, y.host_ptr(), + a.host_ptr(), b.host_ptr()); + } + static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayApplyBinaryInplaceApplyGpu), stream, n, n, y.host_ptr(), + x.host_ptr()); + } +}; + +#define INSTANTIATE_NDARRAY_APPLY_BINARY_CORE(dtype_pair, binary_func) \ + template struct NdarrayApplyBinaryCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + ARITHMETIC_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_BINARY_CORE, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + LOGICAL_BINARY_FUNC_SEQ); } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp index 68e9d7b..0f7ecd4 100644 --- a/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.hip.cpp @@ -1,190 +1,190 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
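// Illustrative sketch (standalone, not OneFlow code): NdarrayApplyBinaryCoreWrapper above
// launches an elementwise binary kernel over n elements through OneFlow's RUN_CUDA_KERNEL
// macro. The minimal HIP program below only mirrors that launch pattern with a plain
// grid-stride loop; kElemwiseAdd, kBlockSize and the grid-size computation are assumptions
// made for this example, not definitions from the patch.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

constexpr int kBlockSize = 256;

__global__ void kElemwiseAdd(size_t n, float* y, const float* a, const float* b) {
  // Grid-stride loop: each thread handles indices i, i + gridDim*blockDim, ...
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    y[i] = a[i] + b[i];
  }
}

int main() {
  const size_t n = 1 << 20;
  std::vector<float> ha(n, 1.f), hb(n, 2.f), hy(n, 0.f);
  float *da, *db, *dy;
  hipMalloc(&da, n * sizeof(float));
  hipMalloc(&db, n * sizeof(float));
  hipMalloc(&dy, n * sizeof(float));
  hipMemcpy(da, ha.data(), n * sizeof(float), hipMemcpyHostToDevice);
  hipMemcpy(db, hb.data(), n * sizeof(float), hipMemcpyHostToDevice);
  const int grid = static_cast<int>((n + kBlockSize - 1) / kBlockSize);
  hipLaunchKernelGGL(kElemwiseAdd, dim3(grid), dim3(kBlockSize), 0, 0, n, dy, da, db);
  hipMemcpy(hy.data(), dy, n * sizeof(float), hipMemcpyDeviceToHost);
  printf("y[0] = %f\n", hy[0]);  // expected 3.0
  hipFree(da);
  hipFree(db);
  hipFree(dy);
  return 0;
}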
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.h" - -namespace oneflow { - -namespace { - -template -struct XY2XFunctor final { - __host__ __device__ XY2XFunctor(Index dim_y) : dim_y_(dim_y) {} - - __host__ __device__ Index operator()(Index idx) const { return idx / dim_y_; } - - Index dim_y_; -}; - -template -struct XY2YFunctor final { - __host__ __device__ XY2YFunctor(Index dim_y) : dim_y_(dim_y) {} - - __host__ __device__ Index operator()(Index idx) const { return idx % dim_y_; } - - Index dim_y_; -}; - -template -struct XYZ2XZFunctor final { - __host__ __device__ XYZ2XZFunctor(Index dim_y, Index dim_z) - : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} - - __host__ __device__ Index operator()(Index idx) const { - const Index x = idx / dim_yz_; - const Index z = (idx % dim_yz_) % dim_z_; - return x * dim_z_ + z; - } - - Index dim_yz_; - Index dim_z_; -}; - -template -struct XYZ2YFunctor final { - __host__ __device__ XYZ2YFunctor(Index dim_y, Index dim_z) - : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} - - __host__ __device__ Index operator()(Index idx) const { return (idx % dim_yz_) / dim_z_; } - - Index dim_yz_; - Index dim_z_; -}; - -template class binary_func, typename OffsetFunctor> -__global__ void PartialBroadcastGpu(K n, typename BinaryFuncTrait::return_type* y, - const T* a, const T* b, OffsetFunctor offset_functor) { - CUDA_1D_KERNEL_LOOP_T(K, i, n) { y[i] = binary_func::Invoke(a[i], b[offset_functor(i)]); } -} - -template class binary_func> -__global__ void GpuBroadcastBinaryFunc( - const XpuVarNdarray::return_type> y, - const XpuVarNdarray a, const XpuVarNdarray b) { - NdarrayApplyBroadcastBinaryCore::Apply(y, a, b); -} -template class binary_func> -__global__ void GpuInplaceBroadcastBinaryFunc(const XpuVarNdarray y, - const XpuVarNdarray x) { - NdarrayApplyBroadcastBinaryCore::InplaceApply(y, x); -} - -} // namespace - -template class binary_func> -struct NdarrayApplyBroadcastBinaryCoreWrapper final { - static void Apply(ep::Stream* stream, - const XpuVarNdarray::return_type>& y, - const XpuVarNdarray& a, const XpuVarNdarray& b) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - if (IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } - if (!IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } - RUN_CUDA_KERNEL((GpuBroadcastBinaryFunc), stream, n, y, a, b); - } - - template - static bool PartialBroadcast( - ep::Stream* stream, - const XpuVarNdarray::return_type>& y, - const XpuVarNdarray& a, const XpuVarNdarray& b) { - size_t n = y.host_shape().HostElemNum(); - if (y.host_shape() == a.host_shape()) { - if (y.host_shape().NumAxes() == 2) { - const K y_dim0 = y.host_shape().At(0); - const K y_dim1 = y.host_shape().At(1); - const K b_dim0 = b.host_shape().At(0); - const K b_dim1 = b.host_shape().At(1); - if (b_dim0 == y_dim0 && b_dim1 == 1) { - XY2XFunctor xy2x(y_dim1); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2x); - return true; - } - if (b_dim0 == 1 && b_dim1 == y_dim1) { - XY2YFunctor xy2y(y_dim1); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2y); - return true; - } - } - if (y.host_shape().NumAxes() == 3) { - const K y_dim0 = y.host_shape().At(0); - const K y_dim1 = y.host_shape().At(1); - const K y_dim2 = y.host_shape().At(2); - const K b_dim0 = b.host_shape().At(0); - const K b_dim1 = b.host_shape().At(1); - const K b_dim2 = b.host_shape().At(2); - if 
(b_dim0 == y_dim0 && b_dim1 == 1 && b_dim2 == y_dim2) { - XYZ2XZFunctor xyz2xz(y_dim1, y_dim2); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2xz); - return true; - } - if (b_dim0 == 1 && b_dim1 == y_dim1 && b_dim2 == 1) { - XYZ2YFunctor xyz2y(y_dim1, y_dim2); - RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, - y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2y); - return true; - } - } - } - return false; - } -}; - -template class binary_func> -struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper - final { - static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - XpuVarNdarray a(y.host_shape(), y.host_ptr()); - using NBB = NdarrayApplyBroadcastBinaryCoreWrapper; - if (n == 0) { return; } - if (IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { - return; - } - if (!IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { - return; - } - RUN_CUDA_KERNEL((GpuInplaceBroadcastBinaryFunc), stream, n, y, x); - } -}; - -#define INSTANTIATE_BROADCAST_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ - template struct NdarrayApplyBroadcastBinaryCoreWrapper< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); - -#define INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ - template struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, - ((bool, DataType::kBool)), DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_broadcast_binary_core.h" + +namespace oneflow { + +namespace { + +template +struct XY2XFunctor final { + __host__ __device__ XY2XFunctor(Index dim_y) : dim_y_(dim_y) {} + + __host__ __device__ Index operator()(Index idx) const { return idx / dim_y_; } + + Index dim_y_; +}; + +template +struct XY2YFunctor final { + __host__ __device__ XY2YFunctor(Index dim_y) : dim_y_(dim_y) {} + + __host__ __device__ Index operator()(Index idx) const { return idx % dim_y_; } + + Index dim_y_; +}; + +template +struct XYZ2XZFunctor final { + __host__ __device__ XYZ2XZFunctor(Index dim_y, Index dim_z) + : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} + + __host__ __device__ Index operator()(Index idx) const { + const Index x = idx / dim_yz_; + const Index z = (idx % dim_yz_) % dim_z_; + return x * dim_z_ + z; + } + + Index dim_yz_; + Index dim_z_; +}; + +template +struct XYZ2YFunctor final { + __host__ __device__ XYZ2YFunctor(Index dim_y, Index dim_z) + : dim_yz_(dim_y * dim_z), dim_z_(dim_z) {} + + __host__ __device__ Index operator()(Index idx) const { return (idx % dim_yz_) / dim_z_; } + + Index dim_yz_; + Index dim_z_; +}; + +template class binary_func, typename OffsetFunctor> +__global__ void PartialBroadcastGpu(K n, typename BinaryFuncTrait::return_type* y, + const T* a, const T* b, OffsetFunctor offset_functor) { + CUDA_1D_KERNEL_LOOP_T(K, i, n) { y[i] = binary_func::Invoke(a[i], b[offset_functor(i)]); } +} + +template class binary_func> +__global__ void GpuBroadcastBinaryFunc( + const XpuVarNdarray::return_type> y, + const XpuVarNdarray a, const XpuVarNdarray b) { + NdarrayApplyBroadcastBinaryCore::Apply(y, a, b); +} +template class binary_func> +__global__ void GpuInplaceBroadcastBinaryFunc(const XpuVarNdarray y, + const XpuVarNdarray x) { + NdarrayApplyBroadcastBinaryCore::InplaceApply(y, x); +} + +} // namespace + +template class binary_func> +struct NdarrayApplyBroadcastBinaryCoreWrapper final { + static void Apply(ep::Stream* stream, + const XpuVarNdarray::return_type>& y, + const XpuVarNdarray& a, const XpuVarNdarray& b) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + if (IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } + if (!IsKernelSafeInt32(n) && PartialBroadcast(stream, y, a, b)) { return; } + RUN_CUDA_KERNEL((GpuBroadcastBinaryFunc), stream, n, y, a, b); + } + + template + static bool PartialBroadcast( + ep::Stream* stream, + const XpuVarNdarray::return_type>& y, + const XpuVarNdarray& a, const XpuVarNdarray& b) { + size_t n = y.host_shape().HostElemNum(); + if (y.host_shape() == a.host_shape()) { + if (y.host_shape().NumAxes() == 2) { + const K y_dim0 = y.host_shape().At(0); + const K y_dim1 = y.host_shape().At(1); + const K b_dim0 = b.host_shape().At(0); + const K b_dim1 = b.host_shape().At(1); + if (b_dim0 == y_dim0 && b_dim1 == 1) { + XY2XFunctor xy2x(y_dim1); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2x); + return true; + } + if (b_dim0 == 1 && b_dim1 == y_dim1) { + XY2YFunctor xy2y(y_dim1); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xy2y); + return true; + } + } + if (y.host_shape().NumAxes() == 3) { + const K y_dim0 = y.host_shape().At(0); + const K y_dim1 = y.host_shape().At(1); + const K y_dim2 = y.host_shape().At(2); + const K b_dim0 = b.host_shape().At(0); + const K b_dim1 = b.host_shape().At(1); + const K b_dim2 = b.host_shape().At(2); + if 
(b_dim0 == y_dim0 && b_dim1 == 1 && b_dim2 == y_dim2) { + XYZ2XZFunctor xyz2xz(y_dim1, y_dim2); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2xz); + return true; + } + if (b_dim0 == 1 && b_dim1 == y_dim1 && b_dim2 == 1) { + XYZ2YFunctor xyz2y(y_dim1, y_dim2); + RUN_CUDA_KERNEL((PartialBroadcastGpu>), stream, n, n, + y.host_ptr(), a.host_ptr(), b.host_ptr(), xyz2y); + return true; + } + } + } + return false; + } +}; + +template class binary_func> +struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper + final { + static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + XpuVarNdarray a(y.host_shape(), y.host_ptr()); + using NBB = NdarrayApplyBroadcastBinaryCoreWrapper; + if (n == 0) { return; } + if (IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { + return; + } + if (!IsKernelSafeInt32(n) && NBB::template PartialBroadcast(stream, y, a, x)) { + return; + } + RUN_CUDA_KERNEL((GpuInplaceBroadcastBinaryFunc), stream, n, y, x); + } +}; + +#define INSTANTIATE_BROADCAST_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ + template struct NdarrayApplyBroadcastBinaryCoreWrapper< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_BINARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); + +#define INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC(dtype_pair, NDIMS, binary_func) \ + template struct NdarrayApplyBroadcastInplaceBinaryCoreWrapper< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, binary_func>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_INPLACE_BINARY_FUNC, + ((bool, DataType::kBool)), DIM_SEQ, LOGICAL_BINARY_FUNC_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp index 1b77803..d1de3fe 100644 --- a/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.hip.cpp @@ -1,46 +1,46 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
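// Illustrative sketch (standalone, not OneFlow code): the XY2X/XY2Y (and XYZ2XZ/XYZ2Y)
// functors above turn a flattened output index into the matching index of the broadcast
// operand, so PartialBroadcastGpu can read b[offset_functor(i)] without materializing the
// broadcast tensor. The host-only check below re-derives the 2D mappings; XY2X/XY2Y here
// are free functions written for this check, not the functor types from the patch.
#include <cassert>
#include <cstdint>

// b of shape (dim_x, 1) broadcast over y of shape (dim_x, dim_y): keep only the row index.
static int64_t XY2X(int64_t idx, int64_t dim_y) { return idx / dim_y; }
// b of shape (1, dim_y) broadcast over y of shape (dim_x, dim_y): keep only the column index.
static int64_t XY2Y(int64_t idx, int64_t dim_y) { return idx % dim_y; }

int main() {
  const int64_t dim_x = 3, dim_y = 5;
  for (int64_t x = 0; x < dim_x; ++x) {
    for (int64_t y = 0; y < dim_y; ++y) {
      const int64_t flat = x * dim_y + y;  // row-major flattened index into the output
      assert(XY2X(flat, dim_y) == x);      // (dim_x, 1) operand is indexed by its row
      assert(XY2Y(flat, dim_y) == y);      // (1, dim_y) operand is indexed by its column
    }
  }
  return 0;
}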
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.h" - -namespace oneflow { - -namespace { - -template class unary_func> -__global__ void GpuBroadcastUnaryFunc(const XpuVarNdarray y, const XpuVarNdarray x) { - NdarrayApplyBroadcastUnaryCore::Apply(y, x); -} - -} // namespace - -template class unary_func> -struct NdarrayApplyBroadcastUnaryCoreWrapper final { - static void Apply(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((GpuBroadcastUnaryFunc), stream, n, y, x); - } -}; - -#define INSTANTIATE_BROADCAST_UNARY_FUNC(dtype_pair, NDIMS, unary_func) \ - template struct NdarrayApplyBroadcastUnaryCoreWrapper< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, unary_func>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_UNARY_FUNC, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_UNARY_FUNC_SEQ) +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_broadcast_unary_core.h" + +namespace oneflow { + +namespace { + +template class unary_func> +__global__ void GpuBroadcastUnaryFunc(const XpuVarNdarray y, const XpuVarNdarray x) { + NdarrayApplyBroadcastUnaryCore::Apply(y, x); +} + +} // namespace + +template class unary_func> +struct NdarrayApplyBroadcastUnaryCoreWrapper final { + static void Apply(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((GpuBroadcastUnaryFunc), stream, n, y, x); + } +}; + +#define INSTANTIATE_BROADCAST_UNARY_FUNC(dtype_pair, NDIMS, unary_func) \ + template struct NdarrayApplyBroadcastUnaryCoreWrapper< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(dtype_pair), NDIMS, unary_func>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BROADCAST_UNARY_FUNC, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_UNARY_FUNC_SEQ) } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp b/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp index 41f68f4..ce2b03f 100644 --- a/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_apply_unary_core.hip.cpp @@ -1,47 +1,47 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_apply_unary_core.h" -#include "oneflow/core/ndarray/unary_func.h" - -namespace oneflow { - -namespace { - -template class unary_func> -__global__ void NdarrayApplyUnaryInplaceApplyGpu(T* ptr, size_t n) { - NdarrayApplyUnaryCore::InplaceApply(ptr, n); -} - -} // namespace - -template class unary_func> -struct NdarrayApplyUnaryCoreWrapper final { - static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayApplyUnaryInplaceApplyGpu), stream, n, y.host_ptr(), n); - } -}; - -#define INSTANTIATE_NDARRAY_APPLY_UNARY_CORE(dtype_pair, unary_func) \ - template struct NdarrayApplyUnaryCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_UNARY_CORE, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - ARITHMETIC_UNARY_FUNC_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_apply_unary_core.h" +#include "oneflow/core/ndarray/unary_func.h" + +namespace oneflow { + +namespace { + +template class unary_func> +__global__ void NdarrayApplyUnaryInplaceApplyGpu(T* ptr, size_t n) { + NdarrayApplyUnaryCore::InplaceApply(ptr, n); +} + +} // namespace + +template class unary_func> +struct NdarrayApplyUnaryCoreWrapper final { + static void InplaceApply(ep::Stream* stream, const XpuVarNdarray& y) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayApplyUnaryInplaceApplyGpu), stream, n, y.host_ptr(), n); + } +}; + +#define INSTANTIATE_NDARRAY_APPLY_UNARY_CORE(dtype_pair, unary_func) \ + template struct NdarrayApplyUnaryCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_APPLY_UNARY_CORE, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, + ARITHMETIC_UNARY_FUNC_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_assign_core.hip.cpp b/oneflow/core/ndarray/ndarray_assign_core.hip.cpp index 16dbfed..f28bbea 100644 --- a/oneflow/core/ndarray/ndarray_assign_core.hip.cpp +++ b/oneflow/core/ndarray/ndarray_assign_core.hip.cpp @@ -1,63 +1,63 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_assign_core.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/kernel/kernel_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, - const XpuReducedNdarray reduced) { - NdarrayAssignCore::Assign(y, reduced); -} - -template -__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { - NdarrayAssignCore::Assign(y, x); -} - -} // namespace - -template -struct NdarrayAssignCoreWrapper final { - static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, - const XpuReducedNdarray& reduced) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), ctx, n, y, reduced); - } - static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); - } -}; - -#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ - template struct NdarrayAssignCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - INSTANTIATE_NDARRAY_ASSIGN, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, - DIM_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_assign_core.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/kernel/kernel_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, + const XpuReducedNdarray reduced) { + NdarrayAssignCore::Assign(y, reduced); +} + +template +__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { + NdarrayAssignCore::Assign(y, x); +} + +} // namespace + +template +struct NdarrayAssignCoreWrapper final { + static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, + const XpuReducedNdarray& reduced) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), ctx, n, y, reduced); + } + static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); + } +}; + +#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ + template struct NdarrayAssignCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + INSTANTIATE_NDARRAY_ASSIGN, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, + DIM_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp b/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp index b651aa5..abd995b 100644 --- a/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp +++ b/oneflow/core/ndarray/ndarray_reduce_impl.hip.cpp @@ -1,383 +1,383 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/ndarray/ndarray_reduce_impl.h" -#include "oneflow/core/ndarray/binary_func.h" -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/common/shape.h" -#include "oneflow/core/common/permutation_iterator.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace hipcub { -struct Prod { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { - return a * b; - } -}; -struct Any { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { - return a || b; - } -}; -struct All { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { - return a && b; - } -}; -} // namespace hipcub - -namespace oneflow { - -namespace { - -template class R, typename T, typename K, typename RetT> -__global__ void MatrixColReduceBy1ThreadPerColumn(K num_elems, K num_cols, const T* in, RetT* out) { - CUDA_1D_KERNEL_LOOP_T(K, j, num_cols) { - K index = j; - T sum = in[index]; - for (index += num_cols; index < num_elems; index += num_cols) { - sum = R::Invoke(sum, in[index]); - } - out[j] = sum; - } -} - -template -struct WithAlign2 { - union { - T value; - int32_t padding; - }; -}; - -template class R, typename T, typename K, typename RetT> -__global__ void MatrixColReduceByWarpBlock(K num_elems, K num_cols, const T* in, RetT* out) { - const K thread_col = threadIdx.x % kCudaWarpSize; - const K thread_row = threadIdx.x / kCudaWarpSize; - const K thread_dim_row = blockDim.x / kCudaWarpSize; - const K num_valid_threads = thread_dim_row * num_cols; // ASSERT: always <= num_elems - const K col = blockIdx.x * kCudaWarpSize + thread_col; - __shared__ WithAlign2 partial_values[kCudaWarpSize * kCudaWarpSize]; - if (col < num_cols) { - K index = thread_row * num_cols + col; - T val = in[index]; - for (index += num_valid_threads; index < num_elems; index += num_valid_threads) { - val = R::Invoke(val, in[index]); - } - partial_values[threadIdx.x].value = val; - } - __syncthreads(); - if (col < num_cols && thread_row == 0) { - int index = thread_col; - T val = partial_values[index].value; - for (index += kCudaWarpSize; index < blockDim.x; index += kCudaWarpSize) { - val = R::Invoke(val, partial_values[index].value); - } - out[col] = val; - } -} - -template class R, typename T, typename K, typename RetT> -void MatrixColReduceBy1BlockLayer(ep::Stream* stream, K num_elems, K num_cols, const T* in, - RetT* out) { - CHECK_LE(num_cols, kCudaMaxBlocksNum * kCudaWarpSize); - const K num_rows = num_elems / num_cols; - CHECK_GT(num_rows, 0); - if (num_rows < kCudaWarpSize) { - RUN_CUDA_KERNEL((MatrixColReduceBy1ThreadPerColumn), stream, num_cols, num_elems, - num_cols, in, out); - } else { - const int num_blocks = (num_cols + kCudaWarpSize - 1) / kCudaWarpSize; - const int num_threads = kCudaWarpSize * kCudaWarpSize; - auto Reduce = &MatrixColReduceByWarpBlock; - Reduce<<As()->cuda_stream()>>>( - num_elems, num_cols, in, out); - } -} - -const static int32_t kNumRows4OneBlockLayer = kCudaWarpSize * kCudaWarpSize; -const static int32_t kNumCols4OneBlockLayer = kCudaMaxBlocksNum * kCudaWarpSize / 2; - -template class R, typename T, typename K> -void MatrixColReduceK(ep::Stream* stream, K num_rows, K num_cols, const T* in, - typename BinaryFuncTrait::return_type* out, T* tmp) { - K num_elems = num_rows * num_cols; - if (num_rows < kNumRows4OneBlockLayer || num_cols > kNumCols4OneBlockLayer) { - MatrixColReduceBy1BlockLayer::return_type>( - stream, 
num_elems, num_cols, in, out); - } else { - int scale_shift = 1; - for (; true; ++scale_shift) { - if ((num_rows >> scale_shift) < kNumRows4OneBlockLayer) { break; } - if ((num_cols << scale_shift) > kNumCols4OneBlockLayer) { break; } - } - MatrixColReduceBy1BlockLayer(stream, num_elems, (num_cols << scale_shift), in, tmp); - // recursively calls MatrixColReduceK(...) log32(num_rows) times at most - MatrixColReduceK(stream, (1 << scale_shift), num_cols, tmp, out, tmp); - } -} - -template class R, typename T> -void MatrixColReduce(ep::Stream* stream, int64_t num_rows, int64_t num_cols, const T* in, - typename BinaryFuncTrait::return_type* out, T* tmp) { - if (IsKernelSafeInt32(num_rows * num_cols)) { - return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); - } else { - return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); - } -} - -} // namespace - -template class binary_func> -struct CubFunctor4BianryFunc; - -#define SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC(func_name) \ - template \ - struct CubFunctor4BianryFunc final { \ - using type = hipcub::func_name; \ - }; -OF_PP_FOR_EACH_ATOMIC(SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC, REDUCE_BINARY_FUNC_NAME_SEQ); -#undef SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC - -struct RowOffsetFunctor final { - OF_DEVICE_FUNC explicit RowOffsetFunctor(int32_t num_cols) : num_cols_(num_cols) {} - OF_DEVICE_FUNC int32_t operator()(const int32_t& x) const { return x * num_cols_; } - int32_t num_cols_; -}; - -template class binary_func> -struct NdarrayScalarReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - return y.shape().ElemNum() == 1; - } - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - size_t x_size = x.shape().ElemNum(); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), x_size, - typename CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - // CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -template class binary_func> -struct NdarrayMatrixRowReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - if (y.shape().ElemNum() > GetMaxVal()) { return false; } - if (x.shape().NumAxes() != 2) { return false; } - if (y.shape().NumAxes() != 2) { return false; } - return x.shape().At(0) == y.shape().At(0) && y.shape().At(1) == 1; - } - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - int32_t num_rows = y.shape().ElemNum(); - int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); - RowOffsetFunctor get_row_offset(num_cols); - hipcub::CountingInputIterator counting_intput_it(0); - hipcub::TransformInputIterator> - transform_input_iter(counting_intput_it, get_row_offset); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceSegmentedReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), num_rows, transform_input_iter, - transform_input_iter + 1, typename 
CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -template class binary_func> -struct NdarrayMatrixColReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - if (y.shape().ElemNum() > GetMaxVal()) { return false; } - if (x.shape().NumAxes() != 2) { return false; } - if (y.shape().NumAxes() != 2) { return false; } - return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1); - } - - struct XY2YXFunctor final { - __host__ __device__ XY2YXFunctor(int32_t dim_x, int32_t dim_y) : dim_x_(dim_x), dim_y_(dim_y) {} - - __host__ __device__ int32_t operator()(const int32_t& idx) const { - const int32_t y = idx / dim_x_; - const int32_t x = idx % dim_x_; - return x * dim_y_ + y; - } - - int32_t dim_x_; - int32_t dim_y_; - }; - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - int64_t num_rows = x.shape().At(0); - int64_t num_cols = x.shape().At(1); - if (num_cols < kNumCols4OneBlockLayer) { - return MatrixColReduce(stream, num_rows, num_cols, x.host_ptr(), y.host_ptr(), - tmp_storage.host_ptr()); - } - RowOffsetFunctor get_row_offset(num_rows); - hipcub::CountingInputIterator counting_intput_it(0); - hipcub::TransformInputIterator> - transform_input_iter(counting_intput_it, get_row_offset); - - XY2YXFunctor xy2yx(x.shape().At(0), x.shape().At(1)); - using XY2YxIndexIter = - hipcub::TransformInputIterator>; - XY2YxIndexIter xy2yx_iter(counting_intput_it, xy2yx); - PermutationIterator x_iter(x.ptr(), xy2yx_iter); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceSegmentedReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_cols, transform_input_iter, - transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -template class binary_func> -struct NdarrayXYZCubeXZReduce final { - using RetT = typename BinaryFuncTrait::return_type; - static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { - if (y.shape().ElemNum() > GetMaxVal()) { return false; } - if (x.shape().NumAxes() != 3) { return false; } - if (y.shape().NumAxes() != 3) { return false; } - return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1) && y.shape().At(2) == 1; - } - - struct XYZ2YxzFunctor final { - __host__ __device__ XYZ2YxzFunctor(int32_t dim_x, int32_t dim_y, int32_t dim_z) - : dim_z_(dim_z), dim_xz_(dim_x * dim_z), dim_yz_(dim_y * dim_z) {} - - __host__ __device__ int32_t operator()(const int32_t& idx) const { - const int32_t y = idx / dim_xz_; - const int32_t xz_idx = idx % dim_xz_; - const int32_t x = xz_idx / dim_z_; - const int32_t z = xz_idx % dim_z_; - return x * dim_yz_ + y * dim_z_ + z; - } - - int32_t dim_z_; - int32_t dim_xz_; - int32_t dim_yz_; - }; - - static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, - const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { - CHECK(Matched(y, x)); - int32_t 
num_rows = y.shape().ElemNum(); - int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); - - RowOffsetFunctor get_row_offset(num_cols); - hipcub::CountingInputIterator counting_intput_it(0); - hipcub::TransformInputIterator> - transform_input_iter(counting_intput_it, get_row_offset); - - XYZ2YxzFunctor xyz2yxz(x.shape().At(0), x.shape().At(1), x.shape().At(2)); - using XYZ2YxzIndexIter = - hipcub::TransformInputIterator>; - XYZ2YxzIndexIter xyz2yxz_iter(counting_intput_it, xyz2yxz); - PermutationIterator x_iter(x.ptr(), xyz2yxz_iter); - size_t tmp_storage_bytes = 0; - auto DoReduce = [&](T* tmp_storage_ptr) { - int retcode = hipcub::DeviceSegmentedReduce::Reduce( - tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_rows, transform_input_iter, - transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), - UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); - CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; - }; - DoReduce(nullptr); - CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); - DoReduce(tmp_storage.ptr()); - } -}; - -namespace { - -template class binary_func> -__global__ void NdarrayReduceGpuInplaceReduceAxis(const XpuReducedNdarray dst_reduced, - const XpuReducedNdarray x, int axis) { - NdarrayReduceCore::ReduceAxis(dst_reduced, x, axis); -} - -} // namespace - -template class binary_func> -struct NdarrayReduceCoreWrapper final { - static void ReduceAxis(ep::Stream* stream, const XpuReducedNdarray& dst_reduced, - const XpuReducedNdarray& x, int axis) { - size_t n = x.host_shape().HostElemNum(); - RUN_CUDA_KERNEL((NdarrayReduceGpuInplaceReduceAxis), stream, n, - dst_reduced, x, axis); - } -}; - -#define INSTANTIATE_NDARRAY_REDUCE_IMPL(dtype, binary_func) \ - template struct NdarrayScalarReduce; \ - template struct NdarrayMatrixRowReduce; \ - template struct NdarrayMatrixColReduce; \ - template struct NdarrayXYZCubeXZReduce; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ, - ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ - BOOL_DATA_TYPE_SEQ, - LOGICAL_REDUCE_BINARY_FUNC_SEQ); - -#define INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER(dtype_pair, NDIMS, binary_func) \ - template struct NdarrayReduceCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, - ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ - UNSIGNED_INT_DATA_TYPE_SEQ, - DIM_SEQ, ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ - BOOL_DATA_TYPE_SEQ, - DIM_SEQ, LOGICAL_REDUCE_BINARY_FUNC_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
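// Illustrative sketch (standalone, not OneFlow code): MatrixColReduceBy1ThreadPerColumn above
// assigns one thread per output column; each thread walks its column with stride num_cols and
// folds the elements with R::Invoke. The host-only function below reproduces that traversal
// for a sum reduction so the indexing can be checked on the CPU; SumReduceColumns is an
// assumed name used only for this example.
#include <cassert>
#include <vector>

static void SumReduceColumns(size_t num_rows, size_t num_cols, const std::vector<float>& in,
                             std::vector<float>& out) {
  const size_t num_elems = num_rows * num_cols;
  for (size_t j = 0; j < num_cols; ++j) {  // one "thread" per column
    size_t index = j;
    float acc = in[index];                 // first element of the column
    for (index += num_cols; index < num_elems; index += num_cols) {
      acc += in[index];                    // R::Invoke(acc, in[index]) with R = sum
    }
    out[j] = acc;
  }
}

int main() {
  const size_t rows = 4, cols = 3;
  std::vector<float> in(rows * cols, 1.f);  // every element is 1, so each column sums to rows
  std::vector<float> out(cols, 0.f);
  SumReduceColumns(rows, cols, in, out);
  for (float v : out) { assert(v == static_cast<float>(rows)); }
  return 0;
}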
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/ndarray/ndarray_reduce_impl.h" +#include "oneflow/core/ndarray/binary_func.h" +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/common/shape.h" +#include "oneflow/core/common/permutation_iterator.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace hipcub { +struct Prod { + template + __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return a * b; + } +}; +struct Any { + template + __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return a || b; + } +}; +struct All { + template + __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return a && b; + } +}; +} // namespace hipcub + +namespace oneflow { + +namespace { + +template class R, typename T, typename K, typename RetT> +__global__ void MatrixColReduceBy1ThreadPerColumn(K num_elems, K num_cols, const T* in, RetT* out) { + CUDA_1D_KERNEL_LOOP_T(K, j, num_cols) { + K index = j; + T sum = in[index]; + for (index += num_cols; index < num_elems; index += num_cols) { + sum = R::Invoke(sum, in[index]); + } + out[j] = sum; + } +} + +template +struct WithAlign2 { + union { + T value; + int32_t padding; + }; +}; + +template class R, typename T, typename K, typename RetT> +__global__ void MatrixColReduceByWarpBlock(K num_elems, K num_cols, const T* in, RetT* out) { + const K thread_col = threadIdx.x % kCudaWarpSize; + const K thread_row = threadIdx.x / kCudaWarpSize; + const K thread_dim_row = blockDim.x / kCudaWarpSize; + const K num_valid_threads = thread_dim_row * num_cols; // ASSERT: always <= num_elems + const K col = blockIdx.x * kCudaWarpSize + thread_col; + __shared__ WithAlign2 partial_values[kCudaWarpSize * kCudaWarpSize]; + if (col < num_cols) { + K index = thread_row * num_cols + col; + T val = in[index]; + for (index += num_valid_threads; index < num_elems; index += num_valid_threads) { + val = R::Invoke(val, in[index]); + } + partial_values[threadIdx.x].value = val; + } + __syncthreads(); + if (col < num_cols && thread_row == 0) { + int index = thread_col; + T val = partial_values[index].value; + for (index += kCudaWarpSize; index < blockDim.x; index += kCudaWarpSize) { + val = R::Invoke(val, partial_values[index].value); + } + out[col] = val; + } +} + +template class R, typename T, typename K, typename RetT> +void MatrixColReduceBy1BlockLayer(ep::Stream* stream, K num_elems, K num_cols, const T* in, + RetT* out) { + CHECK_LE(num_cols, kCudaMaxBlocksNum * kCudaWarpSize); + const K num_rows = num_elems / num_cols; + CHECK_GT(num_rows, 0); + if (num_rows < kCudaWarpSize) { + RUN_CUDA_KERNEL((MatrixColReduceBy1ThreadPerColumn), stream, num_cols, num_elems, + num_cols, in, out); + } else { + const int num_blocks = (num_cols + kCudaWarpSize - 1) / kCudaWarpSize; + const int num_threads = kCudaWarpSize * kCudaWarpSize; + auto Reduce = &MatrixColReduceByWarpBlock; + Reduce<<As()->cuda_stream()>>>( + num_elems, num_cols, in, out); + } +} + +const static int32_t kNumRows4OneBlockLayer = kCudaWarpSize * kCudaWarpSize; +const static int32_t kNumCols4OneBlockLayer = kCudaMaxBlocksNum * kCudaWarpSize / 2; + +template class R, typename T, typename K> +void MatrixColReduceK(ep::Stream* stream, K num_rows, K num_cols, const T* in, + typename BinaryFuncTrait::return_type* out, T* tmp) { + K num_elems = num_rows * num_cols; + if (num_rows < kNumRows4OneBlockLayer || num_cols > kNumCols4OneBlockLayer) { + MatrixColReduceBy1BlockLayer::return_type>( + stream, 
num_elems, num_cols, in, out); + } else { + int scale_shift = 1; + for (; true; ++scale_shift) { + if ((num_rows >> scale_shift) < kNumRows4OneBlockLayer) { break; } + if ((num_cols << scale_shift) > kNumCols4OneBlockLayer) { break; } + } + MatrixColReduceBy1BlockLayer(stream, num_elems, (num_cols << scale_shift), in, tmp); + // recursively calls MatrixColReduceK(...) log32(num_rows) times at most + MatrixColReduceK(stream, (1 << scale_shift), num_cols, tmp, out, tmp); + } +} + +template class R, typename T> +void MatrixColReduce(ep::Stream* stream, int64_t num_rows, int64_t num_cols, const T* in, + typename BinaryFuncTrait::return_type* out, T* tmp) { + if (IsKernelSafeInt32(num_rows * num_cols)) { + return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); + } else { + return MatrixColReduceK(stream, num_rows, num_cols, in, out, tmp); + } +} + +} // namespace + +template class binary_func> +struct CubFunctor4BianryFunc; + +#define SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC(func_name) \ + template \ + struct CubFunctor4BianryFunc final { \ + using type = hipcub::func_name; \ + }; +OF_PP_FOR_EACH_ATOMIC(SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC, REDUCE_BINARY_FUNC_NAME_SEQ); +#undef SPECIALIZE_CUB_FUNCTOR_4_BINARY_FUNC + +struct RowOffsetFunctor final { + OF_DEVICE_FUNC explicit RowOffsetFunctor(int32_t num_cols) : num_cols_(num_cols) {} + OF_DEVICE_FUNC int32_t operator()(const int32_t& x) const { return x * num_cols_; } + int32_t num_cols_; +}; + +template class binary_func> +struct NdarrayScalarReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + return y.shape().ElemNum() == 1; + } + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + size_t x_size = x.shape().ElemNum(); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), x_size, + typename CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + // CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +template class binary_func> +struct NdarrayMatrixRowReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + if (y.shape().ElemNum() > GetMaxVal()) { return false; } + if (x.shape().NumAxes() != 2) { return false; } + if (y.shape().NumAxes() != 2) { return false; } + return x.shape().At(0) == y.shape().At(0) && y.shape().At(1) == 1; + } + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + int32_t num_rows = y.shape().ElemNum(); + int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); + RowOffsetFunctor get_row_offset(num_cols); + hipcub::CountingInputIterator counting_intput_it(0); + hipcub::TransformInputIterator> + transform_input_iter(counting_intput_it, get_row_offset); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceSegmentedReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x.ptr(), y.ptr(), num_rows, transform_input_iter, + transform_input_iter + 1, typename 
CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +template class binary_func> +struct NdarrayMatrixColReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + if (y.shape().ElemNum() > GetMaxVal()) { return false; } + if (x.shape().NumAxes() != 2) { return false; } + if (y.shape().NumAxes() != 2) { return false; } + return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1); + } + + struct XY2YXFunctor final { + __host__ __device__ XY2YXFunctor(int32_t dim_x, int32_t dim_y) : dim_x_(dim_x), dim_y_(dim_y) {} + + __host__ __device__ int32_t operator()(const int32_t& idx) const { + const int32_t y = idx / dim_x_; + const int32_t x = idx % dim_x_; + return x * dim_y_ + y; + } + + int32_t dim_x_; + int32_t dim_y_; + }; + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + int64_t num_rows = x.shape().At(0); + int64_t num_cols = x.shape().At(1); + if (num_cols < kNumCols4OneBlockLayer) { + return MatrixColReduce(stream, num_rows, num_cols, x.host_ptr(), y.host_ptr(), + tmp_storage.host_ptr()); + } + RowOffsetFunctor get_row_offset(num_rows); + hipcub::CountingInputIterator counting_intput_it(0); + hipcub::TransformInputIterator> + transform_input_iter(counting_intput_it, get_row_offset); + + XY2YXFunctor xy2yx(x.shape().At(0), x.shape().At(1)); + using XY2YxIndexIter = + hipcub::TransformInputIterator>; + XY2YxIndexIter xy2yx_iter(counting_intput_it, xy2yx); + PermutationIterator x_iter(x.ptr(), xy2yx_iter); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceSegmentedReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_cols, transform_input_iter, + transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +template class binary_func> +struct NdarrayXYZCubeXZReduce final { + using RetT = typename BinaryFuncTrait::return_type; + static bool Matched(const XpuVarNdarray& y, const XpuVarNdarray& x) { + if (y.shape().ElemNum() > GetMaxVal()) { return false; } + if (x.shape().NumAxes() != 3) { return false; } + if (y.shape().NumAxes() != 3) { return false; } + return y.shape().At(0) == 1 && x.shape().At(1) == y.shape().At(1) && y.shape().At(2) == 1; + } + + struct XYZ2YxzFunctor final { + __host__ __device__ XYZ2YxzFunctor(int32_t dim_x, int32_t dim_y, int32_t dim_z) + : dim_z_(dim_z), dim_xz_(dim_x * dim_z), dim_yz_(dim_y * dim_z) {} + + __host__ __device__ int32_t operator()(const int32_t& idx) const { + const int32_t y = idx / dim_xz_; + const int32_t xz_idx = idx % dim_xz_; + const int32_t x = xz_idx / dim_z_; + const int32_t z = xz_idx % dim_z_; + return x * dim_yz_ + y * dim_z_ + z; + } + + int32_t dim_z_; + int32_t dim_xz_; + int32_t dim_yz_; + }; + + static void Reduce(ep::Stream* stream, const XpuVarNdarray& y, + const XpuVarNdarray& x, const XpuVarNdarray& tmp_storage) { + CHECK(Matched(y, x)); + int32_t 
num_rows = y.shape().ElemNum(); + int32_t num_cols = x.shape().ElemNum() / y.shape().ElemNum(); + + RowOffsetFunctor get_row_offset(num_cols); + hipcub::CountingInputIterator counting_intput_it(0); + hipcub::TransformInputIterator> + transform_input_iter(counting_intput_it, get_row_offset); + + XYZ2YxzFunctor xyz2yxz(x.shape().At(0), x.shape().At(1), x.shape().At(2)); + using XYZ2YxzIndexIter = + hipcub::TransformInputIterator>; + XYZ2YxzIndexIter xyz2yxz_iter(counting_intput_it, xyz2yxz); + PermutationIterator x_iter(x.ptr(), xyz2yxz_iter); + size_t tmp_storage_bytes = 0; + auto DoReduce = [&](T* tmp_storage_ptr) { + int retcode = hipcub::DeviceSegmentedReduce::Reduce( + tmp_storage_ptr, tmp_storage_bytes, x_iter, y.ptr(), num_rows, transform_input_iter, + transform_input_iter + 1, typename CubFunctor4BianryFunc::type(), + UnitOfBinaryFunc::Val(), stream->As()->cuda_stream()); + CHECK_EQ(retcode, 0) << "hipcub::DeviceSegmentedReduce::Reduce error"; + }; + DoReduce(nullptr); + CHECK_GE(tmp_storage.shape().ElemNum() * sizeof(T), tmp_storage_bytes); + DoReduce(tmp_storage.ptr()); + } +}; + +namespace { + +template class binary_func> +__global__ void NdarrayReduceGpuInplaceReduceAxis(const XpuReducedNdarray dst_reduced, + const XpuReducedNdarray x, int axis) { + NdarrayReduceCore::ReduceAxis(dst_reduced, x, axis); +} + +} // namespace + +template class binary_func> +struct NdarrayReduceCoreWrapper final { + static void ReduceAxis(ep::Stream* stream, const XpuReducedNdarray& dst_reduced, + const XpuReducedNdarray& x, int axis) { + size_t n = x.host_shape().HostElemNum(); + RUN_CUDA_KERNEL((NdarrayReduceGpuInplaceReduceAxis), stream, n, + dst_reduced, x, axis); + } +}; + +#define INSTANTIATE_NDARRAY_REDUCE_IMPL(dtype, binary_func) \ + template struct NdarrayScalarReduce; \ + template struct NdarrayMatrixRowReduce; \ + template struct NdarrayMatrixColReduce; \ + template struct NdarrayXYZCubeXZReduce; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ, + ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_IMPL, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ + BOOL_DATA_TYPE_SEQ, + LOGICAL_REDUCE_BINARY_FUNC_SEQ); + +#define INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER(dtype_pair, NDIMS, binary_func) \ + template struct NdarrayReduceCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, + ARITHMETIC_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ + UNSIGNED_INT_DATA_TYPE_SEQ, + DIM_SEQ, ARITHMETIC_REDUCE_BINARY_FUNC_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_REDUCE_CORE_WRAPPER, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ + BOOL_DATA_TYPE_SEQ, + DIM_SEQ, LOGICAL_REDUCE_BINARY_FUNC_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp b/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp index 693b39c..2030335 100644 --- a/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp +++ b/oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp @@ -1,62 +1,62 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
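
The column reductions above (NdarrayMatrixColReduce) drive hipcub::DeviceSegmentedReduce with one segment per output column and remap each segment's indices back onto the row-major input through XY2YXFunctor. Below is a minimal host-side sketch of that index mapping only, with plain loops standing in for the segmented reduce and summation assumed as the binary functor; it is illustrative, not part of the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Same mapping as XY2YXFunctor: a linear index in the (num_cols x num_rows)
// transposed view is turned into an offset into the row-major
// (num_rows x num_cols) input, so segment j touches exactly column j.
int32_t XY2YX(int32_t idx, int32_t num_rows, int32_t num_cols) {
  const int32_t col = idx / num_rows;  // which segment (output column)
  const int32_t row = idx % num_rows;  // position inside the segment
  return row * num_cols + col;
}

int main() {
  const int32_t num_rows = 3, num_cols = 4;
  const std::vector<float> in = {0, 1, 2,  3,
                                 4, 5, 6,  7,
                                 8, 9, 10, 11};  // row-major 3x4
  std::vector<float> col_sum(num_cols, 0.f);
  // One "segment" per column, exactly how RowOffsetFunctor(num_rows) drives
  // the segmented reduce in the code above (binary_func = sum here).
  for (int32_t j = 0; j < num_cols; ++j) {
    for (int32_t idx = j * num_rows; idx < (j + 1) * num_rows; ++idx) {
      col_sum[j] += in[XY2YX(idx, num_rows, num_cols)];
    }
  }
  for (float s : col_sum) { std::printf("%g ", s); }  // prints: 12 15 18 21
  std::printf("\n");
  return 0;
}
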
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/ndarray/ndarray_assign_core.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/kernel/kernel_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, - const XpuReducedNdarray reduced) { - NdarrayAssignCore::Assign(y, reduced); -} - -template -__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { - NdarrayAssignCore::Assign(y, x); -} - -} // namespace - -template -struct NdarrayAssignCoreWrapper final { - static void Assign(ep::Stream* stream, XpuVarNdarray* y, - const XpuReducedNdarray& reduced) { - size_t n = y->host_shape().HostElemNum(); - RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), stream, n, *y, reduced); - } - static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { - size_t n = y.host_shape().HostElemNum(); - if (n == 0) { return; } - RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); - } -}; - -#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ - template struct NdarrayAssignCoreWrapper; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - INSTANTIATE_NDARRAY_ASSIGN, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, - DIM_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/ndarray/ndarray_assign_core.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/kernel/kernel_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void NdarrayAssignReducedGpu(XpuVarNdarray y, + const XpuReducedNdarray reduced) { + NdarrayAssignCore::Assign(y, reduced); +} + +template +__global__ void NdarrayAssignGpu(XpuVarNdarray y, const XpuVarNdarray x) { + NdarrayAssignCore::Assign(y, x); +} + +} // namespace + +template +struct NdarrayAssignCoreWrapper final { + static void Assign(ep::Stream* stream, XpuVarNdarray* y, + const XpuReducedNdarray& reduced) { + size_t n = y->host_shape().HostElemNum(); + RUN_CUDA_KERNEL((NdarrayAssignReducedGpu), stream, n, *y, reduced); + } + static void Assign(ep::Stream* ctx, const XpuVarNdarray& y, const XpuVarNdarray& x) { + size_t n = y.host_shape().HostElemNum(); + if (n == 0) { return; } + RUN_CUDA_KERNEL((NdarrayAssignGpu), ctx, n, y, x); + } +}; + +#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \ + template struct NdarrayAssignCoreWrapper; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + INSTANTIATE_NDARRAY_ASSIGN, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ, + DIM_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/profiler/event.cpp b/oneflow/core/profiler/event.cpp index d56cebf..dfa0142 100644 --- a/oneflow/core/profiler/event.cpp +++ b/oneflow/core/profiler/event.cpp @@ -1,91 +1,91 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#include "fmt/core.h" -#include "fmt/format.h" -#include "oneflow/core/profiler/event.h" -#include "oneflow/core/profiler/util.h" - -using json = nlohmann::json; - -namespace oneflow { - -namespace profiler { -nlohmann::json IEvent::ToJson() { - return json{{"name", name_}, {"time", GetDuration()}, {"input_shapes", "-"}}; -} - -void IEvent::SetStartedAt(double t) { started_at_ = t; } - -void IEvent::SetFinishedAt(double t) { finished_at_ = t; } - -void IEvent::Start() { SetStartedAt(GetTimeNow()); } - -void IEvent::Finish() { SetFinishedAt(GetTimeNow()); } - -bool IEvent::IsChildOf(const IEvent* e) { - if (!e) { return false; } - if (this == e) { return false; } - return GetStartedAt() >= e->GetStartedAt() - && GetFinishedAt() <= e->GetFinishedAt(); -} - -const std::string& IEvent::GetName() const { return name_; } - -std::string CustomEvent::Key() { return name_; } - -nlohmann::json CustomEvent::ToJson() { - auto j = IEvent::ToJson(); - j["type"] = EventType::kCustom; - j["custom_type"] = type_; - return j; -} - -std::shared_ptr CustomEvent::Create(const std::string& name, CustomEventType type) { - return std::shared_ptr(new CustomEvent(name, type)); -} - -std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); } - -nlohmann::json KernelEvent::ToJson() { - auto j = IEvent::ToJson(); - j["type"] = EventType::kOneflowKernel; - j["input_shapes"] = GetFormatedInputShapes(); -#if defined(WITH_CUDA) || defined(WITH_ROCM) - j["memory_size"] = memory_size_; - if (!children_.empty()) { j["children"] = children_; } -#endif // WITH_CUDA - return j; -} - -std::shared_ptr KernelEvent::Create( - const std::string& name, const std::function(void)>& shape_getter) { - return std::shared_ptr(new KernelEvent(name, shape_getter)); -} - -std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) { - if (input_shapes_.size() == 0) { return "-"; } - std::vector shapes_formated(std::min(input_shapes_.size(), max_num_to_format)); - for (auto i = 0; i < shapes_formated.size(); ++i) { - const std::string current_shape = input_shapes_[i].ToString(); - shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape; - } - if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); } - return fmt::format("[{}]", fmt::join(shapes_formated, ", ")); -} - -} // namespace profiler +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "fmt/core.h" +#include "fmt/format.h" +#include "oneflow/core/profiler/event.h" +#include "oneflow/core/profiler/util.h" + +using json = nlohmann::json; + +namespace oneflow { + +namespace profiler { +nlohmann::json IEvent::ToJson() { + return json{{"name", name_}, {"time", GetDuration()}, {"input_shapes", "-"}}; +} + +void IEvent::SetStartedAt(double t) { started_at_ = t; } + +void IEvent::SetFinishedAt(double t) { finished_at_ = t; } + +void IEvent::Start() { SetStartedAt(GetTimeNow()); } + +void IEvent::Finish() { SetFinishedAt(GetTimeNow()); } + +bool IEvent::IsChildOf(const IEvent* e) { + if (!e) { return false; } + if (this == e) { return false; } + return GetStartedAt() >= e->GetStartedAt() + && GetFinishedAt() <= e->GetFinishedAt(); +} + +const std::string& IEvent::GetName() const { return name_; } + +std::string CustomEvent::Key() { return name_; } + +nlohmann::json CustomEvent::ToJson() { + auto j = IEvent::ToJson(); + j["type"] = EventType::kCustom; + j["custom_type"] = type_; + return j; +} + +std::shared_ptr CustomEvent::Create(const std::string& name, CustomEventType type) { + return std::shared_ptr(new CustomEvent(name, type)); +} + +std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); } + +nlohmann::json KernelEvent::ToJson() { + auto j = IEvent::ToJson(); + j["type"] = EventType::kOneflowKernel; + j["input_shapes"] = GetFormatedInputShapes(); +#if defined(WITH_CUDA) || defined(WITH_ROCM) + j["memory_size"] = memory_size_; + if (!children_.empty()) { j["children"] = children_; } +#endif // WITH_CUDA + return j; +} + +std::shared_ptr KernelEvent::Create( + const std::string& name, const std::function(void)>& shape_getter) { + return std::shared_ptr(new KernelEvent(name, shape_getter)); +} + +std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) { + if (input_shapes_.size() == 0) { return "-"; } + std::vector shapes_formated(std::min(input_shapes_.size(), max_num_to_format)); + for (auto i = 0; i < shapes_formated.size(); ++i) { + const std::string current_shape = input_shapes_[i].ToString(); + shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape; + } + if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); } + return fmt::format("[{}]", fmt::join(shapes_formated, ", ")); +} + +} // namespace profiler } // namespace oneflow \ No newline at end of file diff --git a/oneflow/core/profiler/event.h b/oneflow/core/profiler/event.h index af60ff5..59d8c68 100644 --- a/oneflow/core/profiler/event.h +++ b/oneflow/core/profiler/event.h @@ -1,186 +1,186 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_ -#define ONEFLOW_CORE_PROFILER_EVENT_H_ - -#include -#include -#include -#include "nlohmann/json.hpp" -#include "oneflow/core/common/util.h" -#include "oneflow/core/common/shape_view.h" - -namespace oneflow { - -namespace profiler { - -class ProfileManager; - -enum class EventType { - kCustom, // has three kinds - kOneflowKernel // OneFlow cpu/cuda kernel -}; -enum class CustomEventType { - kDefault, // for record_function - kCudaKernel, // cuda kernel - kCudaRuntime // something like cudaLaunchKernel -}; -enum class EventTimeUnit { kNS, kUS }; - -class IEvent { - public: - OF_DISALLOW_COPY_AND_MOVE(IEvent); - - IEvent() = delete; - IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {} - - virtual std::string Key() = 0; - virtual nlohmann::json ToJson(); - virtual ~IEvent() = default; - - virtual void Start(); - virtual void Finish(); - bool IsChildOf(const IEvent* e); - - const std::string& GetName() const; - template - const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const; - template - const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; - template - const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; - - protected: - virtual void SetStartedAt(double t); - virtual void SetFinishedAt(double t); - - std::string name_; - EventTimeUnit time_unit_; - double started_at_ = 0; - double finished_at_ = 0; -}; - -inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) { - if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) { - return time_ / 1000; - } - if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) { - return time_ * 1000; - } - return time_; -} - -template<> -const inline double IEvent::GetStartedAt(EventTimeUnit time_unit) const { - return ConvertTime(started_at_, time_unit_, time_unit); -} - -template<> -const inline time_t IEvent::GetStartedAt(EventTimeUnit time_unit) const { - return static_cast(GetStartedAt(time_unit)); -} - -template<> -const inline double IEvent::GetFinishedAt(EventTimeUnit time_unit) const { - return ConvertTime(finished_at_, time_unit_, time_unit); -} - -template<> -const inline time_t IEvent::GetFinishedAt(EventTimeUnit time_unit) const { - return static_cast(GetFinishedAt(time_unit)); -} - -template<> -const inline double IEvent::GetDuration(EventTimeUnit time_unit) const { - return GetFinishedAt(time_unit) - GetStartedAt(time_unit); -} - -template<> -const inline time_t IEvent::GetDuration(EventTimeUnit time_unit) const { - return static_cast(GetDuration(time_unit)); -} - -class CustomEvent final : public IEvent { - public: - friend class ProfileManager; - std::string Key() override; - - nlohmann::json ToJson() override; - - static std::shared_ptr Create(const std::string& name, - CustomEventType type = CustomEventType::kDefault); - - private: - CustomEventType type_; - CustomEvent(const std::string& custom_name, CustomEventType type) - : IEvent(custom_name, - type == CustomEventType::kDefault ? 
EventTimeUnit::kNS : EventTimeUnit::kUS), - type_(type) {} -}; - -class KernelEvent final : public IEvent { - public: - std::string Key() override; - - nlohmann::json ToJson() override; - - static std::shared_ptr Create( - const std::string& name, const std::function(void)>& shape_getter); - -#if defined(WITH_CUDA) || defined(WITH_ROCM) - void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; } - void AddChildEvent(const std::shared_ptr& e) { children_.emplace(e); } - bool AddChildEventIfSo(const std::shared_ptr& e) { - if (e->IsChildOf(dynamic_cast(this))) { - children_.emplace(e); - return true; - } - return false; - } - bool HasChildEvent(const std::shared_ptr& e) { return children_.count(e); } - void WalkAmongChildren(const std::function& e)>& f) const { - for (const auto& x : children_) { f(x); } - } -#endif // WITH_CUDA - - private: - KernelEvent(const std::string& kernel_name, - const std::function(void)>& shape_getter) - : IEvent(kernel_name, EventTimeUnit::kNS) { - if (shape_getter) { input_shapes_ = shape_getter(); } - } - -#if defined(WITH_CUDA) || defined(WITH_ROCM) - int64_t memory_size_ = -1; - std::set> children_; -#endif // WITH_CUDA - - std::vector input_shapes_; - std::string GetFormatedInputShapes(size_t max_num_to_format = 4); -}; - -} // namespace profiler -} // namespace oneflow - -namespace nlohmann { - -inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) { - j = event->ToJson(); -} - -} // namespace nlohmann - -#endif // ONEFLOW_CORE_PROFILER_EVENT_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_ +#define ONEFLOW_CORE_PROFILER_EVENT_H_ + +#include +#include +#include +#include "nlohmann/json.hpp" +#include "oneflow/core/common/util.h" +#include "oneflow/core/common/shape_view.h" + +namespace oneflow { + +namespace profiler { + +class ProfileManager; + +enum class EventType { + kCustom, // has three kinds + kOneflowKernel // OneFlow cpu/cuda kernel +}; +enum class CustomEventType { + kDefault, // for record_function + kCudaKernel, // cuda kernel + kCudaRuntime // something like cudaLaunchKernel +}; +enum class EventTimeUnit { kNS, kUS }; + +class IEvent { + public: + OF_DISALLOW_COPY_AND_MOVE(IEvent); + + IEvent() = delete; + IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {} + + virtual std::string Key() = 0; + virtual nlohmann::json ToJson(); + virtual ~IEvent() = default; + + virtual void Start(); + virtual void Finish(); + bool IsChildOf(const IEvent* e); + + const std::string& GetName() const; + template + const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const; + template + const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; + template + const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const; + + protected: + virtual void SetStartedAt(double t); + virtual void SetFinishedAt(double t); + + std::string name_; + EventTimeUnit time_unit_; + double started_at_ = 0; + double finished_at_ = 0; +}; + +inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) { + if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) { + return time_ / 1000; + } + if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) { + return time_ * 1000; + } + return time_; +} + +template<> +const inline double IEvent::GetStartedAt(EventTimeUnit time_unit) const { + return ConvertTime(started_at_, time_unit_, time_unit); +} + +template<> +const inline time_t IEvent::GetStartedAt(EventTimeUnit time_unit) const { + return static_cast(GetStartedAt(time_unit)); +} + +template<> +const inline double IEvent::GetFinishedAt(EventTimeUnit time_unit) const { + return ConvertTime(finished_at_, time_unit_, time_unit); +} + +template<> +const inline time_t IEvent::GetFinishedAt(EventTimeUnit time_unit) const { + return static_cast(GetFinishedAt(time_unit)); +} + +template<> +const inline double IEvent::GetDuration(EventTimeUnit time_unit) const { + return GetFinishedAt(time_unit) - GetStartedAt(time_unit); +} + +template<> +const inline time_t IEvent::GetDuration(EventTimeUnit time_unit) const { + return static_cast(GetDuration(time_unit)); +} + +class CustomEvent final : public IEvent { + public: + friend class ProfileManager; + std::string Key() override; + + nlohmann::json ToJson() override; + + static std::shared_ptr Create(const std::string& name, + CustomEventType type = CustomEventType::kDefault); + + private: + CustomEventType type_; + CustomEvent(const std::string& custom_name, CustomEventType type) + : IEvent(custom_name, + type == CustomEventType::kDefault ? 
EventTimeUnit::kNS : EventTimeUnit::kUS), + type_(type) {} +}; + +class KernelEvent final : public IEvent { + public: + std::string Key() override; + + nlohmann::json ToJson() override; + + static std::shared_ptr Create( + const std::string& name, const std::function(void)>& shape_getter); + +#if defined(WITH_CUDA) || defined(WITH_ROCM) + void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; } + void AddChildEvent(const std::shared_ptr& e) { children_.emplace(e); } + bool AddChildEventIfSo(const std::shared_ptr& e) { + if (e->IsChildOf(dynamic_cast(this))) { + children_.emplace(e); + return true; + } + return false; + } + bool HasChildEvent(const std::shared_ptr& e) { return children_.count(e); } + void WalkAmongChildren(const std::function& e)>& f) const { + for (const auto& x : children_) { f(x); } + } +#endif // WITH_CUDA + + private: + KernelEvent(const std::string& kernel_name, + const std::function(void)>& shape_getter) + : IEvent(kernel_name, EventTimeUnit::kNS) { + if (shape_getter) { input_shapes_ = shape_getter(); } + } + +#if defined(WITH_CUDA) || defined(WITH_ROCM) + int64_t memory_size_ = -1; + std::set> children_; +#endif // WITH_CUDA + + std::vector input_shapes_; + std::string GetFormatedInputShapes(size_t max_num_to_format = 4); +}; + +} // namespace profiler +} // namespace oneflow + +namespace nlohmann { + +inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) { + j = event->ToJson(); +} + +} // namespace nlohmann + +#endif // ONEFLOW_CORE_PROFILER_EVENT_H_ diff --git a/oneflow/core/profiler/event_recorder.h b/oneflow/core/profiler/event_recorder.h index 31b1c34..6332948 100644 --- a/oneflow/core/profiler/event_recorder.h +++ b/oneflow/core/profiler/event_recorder.h @@ -1,60 +1,60 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ -#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ - -#include "oneflow/core/common/util.h" -#include "oneflow/core/profiler/event.h" - -namespace oneflow { -namespace profiler { - -class EventRecorder { - public: - using ShapeGetterFuncType = std::function(void)>; - - OF_DISALLOW_COPY_AND_MOVE(EventRecorder); - - explicit EventRecorder(const std::shared_ptr& event) : event_(event) { - CHECK_JUST(RegisterEventToProfileManager(event)); - event_->Start(); - } - - Maybe RegisterEventToProfileManager(const std::shared_ptr& event); - - ~EventRecorder() { - if (event_) { - event_->Finish(); - event_.reset(); - } - } - static std::shared_ptr CreateCustomEventRecorder(const std::string& name); - - static Maybe CreateKernelEventRecorder( - const std::string& name, -#if defined(WITH_CUDA) || defined(WITH_ROCM) - const std::function& memory_size_getter, -#endif - const ShapeGetterFuncType& shape_getter); - - private: - std::shared_ptr event_; -}; - -} // namespace profiler -} // namespace oneflow - -#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
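
The to_json overload at the end of event.h is what allows `j["children"] = children_;` in KernelEvent::ToJson to serialize a std::set of shared_ptr events directly: nlohmann::json finds the overload through argument-dependent lookup and applies it element-wise to the container. A reduced sketch of that mechanism, using a hypothetical demo::Event type rather than the profiler's classes:

#include <cstdio>
#include <memory>
#include <set>
#include <string>
#include "nlohmann/json.hpp"

namespace demo {
struct Event {
  std::string name;
  double duration_us = 0;
};
// Found via ADL (demo is an associated namespace of std::shared_ptr<demo::Event>),
// so nlohmann::json can convert containers of shared_ptr<Event> element-wise.
inline void to_json(nlohmann::json& j, const std::shared_ptr<Event>& e) {
  j = nlohmann::json{{"name", e->name}, {"time", e->duration_us}};
}
}  // namespace demo

int main() {
  auto child = std::make_shared<demo::Event>();
  child->name = "child_kernel";
  child->duration_us = 12.5;
  std::set<std::shared_ptr<demo::Event>> children{child};

  nlohmann::json j;
  j["children"] = children;  // each element goes through demo::to_json
  std::printf("%s\n", j.dump().c_str());
  return 0;
}
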
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ +#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ + +#include "oneflow/core/common/util.h" +#include "oneflow/core/profiler/event.h" + +namespace oneflow { +namespace profiler { + +class EventRecorder { + public: + using ShapeGetterFuncType = std::function(void)>; + + OF_DISALLOW_COPY_AND_MOVE(EventRecorder); + + explicit EventRecorder(const std::shared_ptr& event) : event_(event) { + CHECK_JUST(RegisterEventToProfileManager(event)); + event_->Start(); + } + + Maybe RegisterEventToProfileManager(const std::shared_ptr& event); + + ~EventRecorder() { + if (event_) { + event_->Finish(); + event_.reset(); + } + } + static std::shared_ptr CreateCustomEventRecorder(const std::string& name); + + static Maybe CreateKernelEventRecorder( + const std::string& name, +#if defined(WITH_CUDA) || defined(WITH_ROCM) + const std::function& memory_size_getter, +#endif + const ShapeGetterFuncType& shape_getter); + + private: + std::shared_ptr event_; +}; + +} // namespace profiler +} // namespace oneflow + +#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_ diff --git a/oneflow/core/vm/sync_vm_mode_guard.h b/oneflow/core/vm/sync_vm_mode_guard.h index 5e63607..40e7179 100644 --- a/oneflow/core/vm/sync_vm_mode_guard.h +++ b/oneflow/core/vm/sync_vm_mode_guard.h @@ -1,39 +1,39 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ -#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ - -#include "oneflow/core/common/thread_local_guard.h" - -namespace oneflow { - -enum class SyncVmMode { - kInvalid = 0, - kEnable = 1, - kDisable = 2, -}; - -class SyncVmModeGuard final : public ThreadLocalGuard { - public: - using ThreadLocalGuard::ThreadLocalGuard; - ~SyncVmModeGuard() = default; - - static bool IsCurrentSyncVmMode() { - const auto& opt_sync_mode = Current(); - return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable; - } -}; - -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
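
EventRecorder above is a plain RAII guard: the constructor registers the event and calls Start(), the destructor calls Finish(), so a recorder placed on the stack times exactly the scope it lives in. A stripped-down, self-contained sketch of the same pattern; ScopedTimer is a hypothetical name and std::chrono stands in for the profiler's own clock.

#include <chrono>
#include <cstdio>
#include <string>
#include <utility>

// Start on construction, report on destruction, mirroring the
// Start()/Finish() pairing that EventRecorder guarantees.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), started_at_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::steady_clock::now() - started_at_)
                        .count();
    std::printf("%s took %lld us\n", name_.c_str(), static_cast<long long>(us));
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point started_at_;
};

int main() {
  ScopedTimer t("demo_scope");  // reported automatically at scope exit
  volatile double acc = 0;
  for (int i = 0; i < 1000000; ++i) { acc += i * 0.5; }
  return 0;
}
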
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ +#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ + +#include "oneflow/core/common/thread_local_guard.h" + +namespace oneflow { + +enum class SyncVmMode { + kInvalid = 0, + kEnable = 1, + kDisable = 2, +}; + +class SyncVmModeGuard final : public ThreadLocalGuard { + public: + using ThreadLocalGuard::ThreadLocalGuard; + ~SyncVmModeGuard() = default; + + static bool IsCurrentSyncVmMode() { + const auto& opt_sync_mode = Current(); + return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable; + } +}; + +} // namespace oneflow + #endif // ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_ \ No newline at end of file diff --git a/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp b/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp index 326c408..b925d3e 100644 --- a/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp +++ b/oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp @@ -1,296 +1,296 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/operator/operator_util.h" -#include "oneflow/user/utils/pool_util.h" - -#include -#include -#include - -namespace oneflow { - -namespace user_op { - -#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b) -#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b) - -#define START_IND_INT(a, b, c) ((a * c) / b) -#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b) - -template -__global__ void InitPtr(int elements, T* ptr) { - int gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int step = gridDim.x * blockDim.x; - while (gid < elements) { - ptr[gid] = static_cast(0); - gid += step; - } -} - -inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) { - FixedDimVector shape_3d = {GetInDim(shape, data_format, 0, dim), - GetInDim(shape, data_format, 1, dim), - GetInDim(shape, data_format, 2, dim)}; - return Shape({shape.At(0), shape.At(1), shape_3d.at(0), shape_3d.at(1), shape_3d.at(2)}); -} - -template -__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d, - int in_h, int in_w, int out_d, int out_h, int out_w) { - const int out_panel_size = out_d * out_h * out_w; - const int in_panel_size = in_d * in_h * in_w; - - CUDA_1D_KERNEL_LOOP(idx, num_elems) { - // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' - int bc_idx = idx / out_panel_size; - int out_d_idx = (idx % out_panel_size) / out_w / out_h; - int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; - int out_w_idx = (idx % out_panel_size) % (out_h * 
out_w) % out_w; - - int in_start_d = START_IND(out_d_idx, out_d, in_d); - int in_end_d = END_IND(out_d_idx, out_d, in_d); - int k_d = in_end_d - in_start_d; - - int in_start_h = START_IND(out_h_idx, out_h, in_h); - int in_end_h = END_IND(out_h_idx, out_h, in_h); - int k_h = in_end_h - in_start_h; - - int in_start_w = START_IND(out_w_idx, out_w, in_w); - int in_end_w = END_IND(out_w_idx, out_w, in_w); - int k_w = in_end_w - in_start_w; - - const T* in_ptr = - input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; - T sum = static_cast(0); - for (int id = 0; id < k_d; ++id) { - for (int ih = 0; ih < k_h; ++ih) { - for (int iw = 0; iw < k_w; ++iw) { - T val = *(in_ptr + ih * in_w + iw); - sum += val; - } - } - in_ptr += in_h * in_w; // next input depth - } - // Update output - output[idx] = sum / k_d / k_h / k_w; - } -} - -template -__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d, - int in_h, int in_w, int out_d, int out_h, int out_w) { - const int out_panel_size = out_d * out_h * out_w; - const int in_panel_size = in_d * in_h * in_w; - - CUDA_1D_KERNEL_LOOP(idx, num_elems) { - // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' - int bc_idx = idx / out_panel_size; - int out_d_idx = (idx % out_panel_size) / out_w / out_h; - int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; - int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w; - - int in_start_d = START_IND(out_d_idx, out_d, in_d); - int in_end_d = END_IND(out_d_idx, out_d, in_d); - int k_d = in_end_d - in_start_d; - - int in_start_h = START_IND(out_h_idx, out_h, in_h); - int in_end_h = END_IND(out_h_idx, out_h, in_h); - int k_h = in_end_h - in_start_h; - - int in_start_w = START_IND(out_w_idx, out_w, in_w); - int in_end_w = END_IND(out_w_idx, out_w, in_w); - int k_w = in_end_w - in_start_w; - - const T grad_delta = output[idx] / k_d / k_h / k_w; - T* input_ptr = - input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; - for (int id = 0; id < k_d; ++id) { - for (int ih = 0; ih < k_h; ++ih) { - for (int iw = 0; iw < k_w; ++iw) { - // TODO (Tianyu): Use 'atmoic::Add' when necessary - cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta); - } - } - input_ptr += in_h * in_w; // next input depth - } - } -} - -template -void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) { - const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const T* in_ptr = in_tensor->dptr(); - T* out_ptr = out_tensor->mut_dptr(); - - const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); - const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); - - // TODO (Tianyu): Support 'channels_last' - std::string data_format = "channels_first"; - const Shape& in = GetShape5D(x_shape, data_format, dim); - const Shape& out = GetShape5D(y_shape, data_format, dim); - - const int out_elems = out_tensor->shape_view().elem_cnt(); - - RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, - out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); -} - -template -void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) { - const Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - const T* out_ptr = out_tensor->dptr(); - T* in_ptr = in_tensor->mut_dptr(); - - const Shape& 
dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); - const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); - - // TODO (Tianyu): Support 'channels_last' - std::string data_format = "channels_first"; - const Shape& in = GetShape5D(dx_shape, data_format, dim); - const Shape& out = GetShape5D(dy_shape, data_format, dim); - - const int in_elems = in_tensor->shape_view().elem_cnt(); - const int out_elems = out_tensor->shape_view().elem_cnt(); - - RUN_CUDA_KERNEL((InitPtr), ctx->stream(), in_elems, in_elems, in_ptr); - RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, - out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); -} - -template -class GpuAdaptiveAvgPool1dKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool1dKernel() = default; - ~GpuAdaptiveAvgPool1dKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 1); } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool2dKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool2dKernel() = default; - ~GpuAdaptiveAvgPool2dKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 2); } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool3dKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool3dKernel() = default; - ~GpuAdaptiveAvgPool3dKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 3); } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool1dGradKernel() = default; - ~GpuAdaptiveAvgPool1dGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 1); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool2dGradKernel() = default; - ~GpuAdaptiveAvgPool2dGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 2); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel { - public: - GpuAdaptiveAvgPool3dGradKernel() = default; - ~GpuAdaptiveAvgPool3dGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 3); } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("adaptive_avg_pool1d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool3d") \ - .SetCreateFn>() \ - 
.SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int); - -#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("dx", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("dx", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == device) \ - && (HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double); -REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int); - -} // namespace user_op - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/operator/operator_util.h" +#include "oneflow/user/utils/pool_util.h" + +#include +#include +#include + +namespace oneflow { + +namespace user_op { + +#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b) +#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b) + +#define START_IND_INT(a, b, c) ((a * c) / b) +#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b) + +template +__global__ void InitPtr(int elements, T* ptr) { + int gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int step = gridDim.x * blockDim.x; + while (gid < elements) { + ptr[gid] = static_cast(0); + gid += step; + } +} + +inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) { + FixedDimVector shape_3d = {GetInDim(shape, data_format, 0, dim), + GetInDim(shape, data_format, 1, dim), + GetInDim(shape, data_format, 2, dim)}; + return Shape({shape.At(0), shape.At(1), shape_3d.at(0), shape_3d.at(1), shape_3d.at(2)}); +} + +template +__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d, + int in_h, int in_w, int out_d, int out_h, int out_w) { + const int out_panel_size = out_d * out_h * out_w; + const int in_panel_size = in_d * in_h * in_w; + + CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' + int bc_idx = idx / out_panel_size; + int out_d_idx = (idx % out_panel_size) / out_w / out_h; + int out_h_idx = (idx % out_panel_size) % 
(out_h * out_w) / out_w; + int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w; + + int in_start_d = START_IND(out_d_idx, out_d, in_d); + int in_end_d = END_IND(out_d_idx, out_d, in_d); + int k_d = in_end_d - in_start_d; + + int in_start_h = START_IND(out_h_idx, out_h, in_h); + int in_end_h = END_IND(out_h_idx, out_h, in_h); + int k_h = in_end_h - in_start_h; + + int in_start_w = START_IND(out_w_idx, out_w, in_w); + int in_end_w = END_IND(out_w_idx, out_w, in_w); + int k_w = in_end_w - in_start_w; + + const T* in_ptr = + input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; + T sum = static_cast(0); + for (int id = 0; id < k_d; ++id) { + for (int ih = 0; ih < k_h; ++ih) { + for (int iw = 0; iw < k_w; ++iw) { + T val = *(in_ptr + ih * in_w + iw); + sum += val; + } + } + in_ptr += in_h * in_w; // next input depth + } + // Update output + output[idx] = sum / k_d / k_h / k_w; + } +} + +template +__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d, + int in_h, int in_w, int out_d, int out_h, int out_w) { + const int out_panel_size = out_d * out_h * out_w; + const int in_panel_size = in_d * in_h * in_w; + + CUDA_1D_KERNEL_LOOP(idx, num_elems) { + // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper' + int bc_idx = idx / out_panel_size; + int out_d_idx = (idx % out_panel_size) / out_w / out_h; + int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w; + int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w; + + int in_start_d = START_IND(out_d_idx, out_d, in_d); + int in_end_d = END_IND(out_d_idx, out_d, in_d); + int k_d = in_end_d - in_start_d; + + int in_start_h = START_IND(out_h_idx, out_h, in_h); + int in_end_h = END_IND(out_h_idx, out_h, in_h); + int k_h = in_end_h - in_start_h; + + int in_start_w = START_IND(out_w_idx, out_w, in_w); + int in_end_w = END_IND(out_w_idx, out_w, in_w); + int k_w = in_end_w - in_start_w; + + const T grad_delta = output[idx] / k_d / k_h / k_w; + T* input_ptr = + input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w; + for (int id = 0; id < k_d; ++id) { + for (int ih = 0; ih < k_h; ++ih) { + for (int iw = 0; iw < k_w; ++iw) { + // TODO (Tianyu): Use 'atmoic::Add' when necessary + cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta); + } + } + input_ptr += in_h * in_w; // next input depth + } + } +} + +template +void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) { + const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const T* in_ptr = in_tensor->dptr(); + T* out_ptr = out_tensor->mut_dptr(); + + const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape(); + const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape(); + + // TODO (Tianyu): Support 'channels_last' + std::string data_format = "channels_first"; + const Shape& in = GetShape5D(x_shape, data_format, dim); + const Shape& out = GetShape5D(y_shape, data_format, dim); + + const int out_elems = out_tensor->shape_view().elem_cnt(); + + RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, + out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); +} + +template +void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) { + const Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const T* out_ptr 
= out_tensor->dptr(); + T* in_ptr = in_tensor->mut_dptr(); + + const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape(); + const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape(); + + // TODO (Tianyu): Support 'channels_last' + std::string data_format = "channels_first"; + const Shape& in = GetShape5D(dx_shape, data_format, dim); + const Shape& out = GetShape5D(dy_shape, data_format, dim); + + const int in_elems = in_tensor->shape_view().elem_cnt(); + const int out_elems = out_tensor->shape_view().elem_cnt(); + + RUN_CUDA_KERNEL((InitPtr), ctx->stream(), in_elems, in_elems, in_ptr); + RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel), ctx->stream(), out_elems, in_ptr, out_ptr, + out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4)); +} + +template +class GpuAdaptiveAvgPool1dKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool1dKernel() = default; + ~GpuAdaptiveAvgPool1dKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 1); } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool2dKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool2dKernel() = default; + ~GpuAdaptiveAvgPool2dKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 2); } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool3dKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool3dKernel() = default; + ~GpuAdaptiveAvgPool3dKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute(ctx, 3); } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool1dGradKernel() = default; + ~GpuAdaptiveAvgPool1dGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 1); } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool2dGradKernel() = default; + ~GpuAdaptiveAvgPool2dGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 2); } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel { + public: + GpuAdaptiveAvgPool3dGradKernel() = default; + ~GpuAdaptiveAvgPool3dGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute(ctx, 3); } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("adaptive_avg_pool1d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("adaptive_avg_pool2d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("y", 0) == GetDataType::value)); \ + 
REGISTER_USER_KERNEL("adaptive_avg_pool3d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int); + +#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("dx", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("dx", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == device) \ + && (HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double); +REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int); + +} // namespace user_op + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/affine_grid_kernel.hip.cpp b/oneflow/user/kernels/affine_grid_kernel.hip.cpp index 9fe19bd..c5a445d 100644 --- a/oneflow/user/kernels/affine_grid_kernel.hip.cpp +++ b/oneflow/user/kernels/affine_grid_kernel.hip.cpp @@ -1,133 +1,133 @@ - -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/device/cuda_util.h" -#include "affine_grid_kernel.h" - -namespace oneflow { - -namespace { - -template -OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) { - if (num_steps <= 1) { return static_cast(0.0); } - - if (align_corners) { - return static_cast(-1.0 + 2.0 / (num_steps - 1) * index); - } else { - return static_cast((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1) - / num_steps); - } -} - -template -__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H, - int32_t W) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int32_t h = index / W; - const int32_t w = index % W; - const int32_t pixel_length = 3; - data_type* row_ptr = grid_ptr + h * W * pixel_length; - data_type* pixel_ptr = row_ptr + w * pixel_length; - data_type h_value = LinspaceGPU(h, H); - data_type w_value = LinspaceGPU(w, W); - - pixel_ptr[0] = w_value; - pixel_ptr[1] = h_value; - pixel_ptr[2] = static_cast(1.0); - } -} - -template -__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D, - int32_t H, int32_t W) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int32_t d = index / H; - const int32_t h = index % H; - const int32_t pixel_length = 4; - data_type* image_ptr = grid_ptr + d * H * W * pixel_length; - data_type* row_ptr = image_ptr + h * W * pixel_length; - data_type d_value = LinspaceGPU(d, D); - data_type h_value = LinspaceGPU(h, H); - - for (int32_t w = 0; w < W; ++w) { - data_type* pixel_ptr = row_ptr + w * pixel_length; - data_type w_value = LinspaceGPU(w, W); - pixel_ptr[0] = w_value; - pixel_ptr[1] = h_value; - pixel_ptr[2] = d_value; - pixel_ptr[3] = static_cast(1.0); - } - } -} - -} // namespace - -void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, - float* grid_ptr, int64_t H, int64_t W, - bool align_corners) { - int count = H * W; - if (align_corners) { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } else { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } -} -void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, - double* grid_ptr, int64_t H, int64_t W, - bool align_corners) { - int count = H * W; - if (align_corners) { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } else { - RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, H, W); - } -} - -void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, - float* grid_ptr, int64_t D, int64_t H, - int64_t W, bool align_corners) { - int count = D * H; - if (align_corners) { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } else { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } -} - -void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, - double* grid_ptr, int64_t D, int64_t H, - int64_t W, bool align_corners) { - int count = D * H; - if (align_corners) { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } else { - RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, - grid_ptr, D, H, W); - } -} - + +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/device/cuda_util.h" +#include "affine_grid_kernel.h" + +namespace oneflow { + +namespace { + +template +OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) { + if (num_steps <= 1) { return static_cast(0.0); } + + if (align_corners) { + return static_cast(-1.0 + 2.0 / (num_steps - 1) * index); + } else { + return static_cast((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1) + / num_steps); + } +} + +template +__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H, + int32_t W) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int32_t h = index / W; + const int32_t w = index % W; + const int32_t pixel_length = 3; + data_type* row_ptr = grid_ptr + h * W * pixel_length; + data_type* pixel_ptr = row_ptr + w * pixel_length; + data_type h_value = LinspaceGPU(h, H); + data_type w_value = LinspaceGPU(w, W); + + pixel_ptr[0] = w_value; + pixel_ptr[1] = h_value; + pixel_ptr[2] = static_cast(1.0); + } +} + +template +__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D, + int32_t H, int32_t W) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int32_t d = index / H; + const int32_t h = index % H; + const int32_t pixel_length = 4; + data_type* image_ptr = grid_ptr + d * H * W * pixel_length; + data_type* row_ptr = image_ptr + h * W * pixel_length; + data_type d_value = LinspaceGPU(d, D); + data_type h_value = LinspaceGPU(h, H); + + for (int32_t w = 0; w < W; ++w) { + data_type* pixel_ptr = row_ptr + w * pixel_length; + data_type w_value = LinspaceGPU(w, W); + pixel_ptr[0] = w_value; + pixel_ptr[1] = h_value; + pixel_ptr[2] = d_value; + pixel_ptr[3] = static_cast(1.0); + } + } +} + +} // namespace + +void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, + float* grid_ptr, int64_t H, int64_t W, + bool align_corners) { + int count = H * W; + if (align_corners) { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } else { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } +} +void GenerateBaseGridImp::Generate2D(user_op::KernelComputeContext* ctx, + double* grid_ptr, int64_t H, int64_t W, + bool align_corners) { + int count = H * W; + if (align_corners) { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } else { + RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, H, W); + } +} + +void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, + float* grid_ptr, int64_t D, int64_t H, + int64_t W, bool align_corners) { + int count = D * H; + if (align_corners) { + RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } else { + 
RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } +} + +void GenerateBaseGridImp::Generate3D(user_op::KernelComputeContext* ctx, + double* grid_ptr, int64_t D, int64_t H, + int64_t W, bool align_corners) { + int count = D * H; + if (align_corners) { + RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } else { + RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel), ctx->stream(), count, count, + grid_ptr, D, H, W); + } +} + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/arange_kernel_util.hip.cpp b/oneflow/user/kernels/arange_kernel_util.hip.cpp index 2df4427..d7e6b59 100644 --- a/oneflow/user/kernels/arange_kernel_util.hip.cpp +++ b/oneflow/user/kernels/arange_kernel_util.hip.cpp @@ -1,48 +1,48 @@ - -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/arange_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template -__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt, - T* out) { - // Use Loop to set the value - DoArange(start, delta, arange_elem_cnt, out); -} - -template -struct ArangeFunctor final { - void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt, - T* out) { - // The thread num is set as arange_elem_cnt - RUN_CUDA_KERNEL((ArangeForwardGpuKernel), stream, arange_elem_cnt, start, delta, - arange_elem_cnt, out); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA), - ARANGE_DATA_TYPE_SEQ); -} // namespace user_op -} // namespace oneflow - + +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
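A host-side sketch of the coordinate formula in LinspaceGPU, assuming the usual grid_sample convention: with align_corners the samples sit on the endpoints of [-1, 1]; without it the second branch rescales by (num_steps - 1) / num_steps, which simplifies to cell centers at -1 + (2 * i + 1) / num_steps. NormalizedCoord is an illustrative name, not part of the kernel.

#include <cstdio>

// align_corners = true : samples on the [-1, 1] endpoints.
// align_corners = false: samples at pixel centers, -1 + (2*i + 1) / n.
double NormalizedCoord(int i, int n, bool align_corners) {
  if (n <= 1) { return 0.0; }
  const double v = -1.0 + 2.0 / (n - 1) * i;
  return align_corners ? v : v * (n - 1) / n;
}

int main() {
  for (int i = 0; i < 4; ++i) {
    printf("i=%d  corners=%+.4f  centers=%+.4f\n", i,
           NormalizedCoord(i, 4, true), NormalizedCoord(i, 4, false));
  }
  return 0;
}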
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/arange_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template +__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt, + T* out) { + // Use Loop to set the value + DoArange(start, delta, arange_elem_cnt, out); +} + +template +struct ArangeFunctor final { + void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt, + T* out) { + // The thread num is set as arange_elem_cnt + RUN_CUDA_KERNEL((ArangeForwardGpuKernel), stream, arange_elem_cnt, start, delta, + arange_elem_cnt, out); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA), + ARANGE_DATA_TYPE_SEQ); +} // namespace user_op +} // namespace oneflow + #endif // End WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/arg_sort_kernel.hip.cpp b/oneflow/user/kernels/arg_sort_kernel.hip.cpp index 46372db..1f2f276 100644 --- a/oneflow/user/kernels/arg_sort_kernel.hip.cpp +++ b/oneflow/user/kernels/arg_sort_kernel.hip.cpp @@ -1,148 +1,148 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
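The CUDA_1D_KERNEL_LOOP used throughout these kernels expands to a grid-stride loop, so any launch configuration covers all elem_cnt elements. Below is a self-contained HIP sketch of that pattern applied to arange; DoArange itself lives in arange_kernel_util.h, and the loop body here is an assumed equivalent, not the original implementation.

#include <hip/hip_runtime.h>
#include <cstdio>

// Grid-stride loop: each thread starts at its global id and strides by the
// total number of launched threads until elem_cnt is exhausted.
template<typename T>
__global__ void ArangeSketch(T start, T delta, int64_t elem_cnt, T* out) {
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < elem_cnt;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    out[i] = start + static_cast<T>(i) * delta;
  }
}

int main() {
  const int64_t n = 8;
  float* out = nullptr;
  hipMalloc(reinterpret_cast<void**>(&out), n * sizeof(float));
  ArangeSketch<float><<<1, 256>>>(2.0f, 0.5f, n, out);
  float host[8];
  hipMemcpy(host, out, n * sizeof(float), hipMemcpyDeviceToHost);
  for (int i = 0; i < 8; ++i) { printf("%.1f ", host[i]); }  // 2.0 2.5 ... 5.5
  printf("\n");
  hipFree(out);
  return 0;
}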
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape) - : capacity_{capacity}, - sorted_in_elem_cnt_{in_shape.elem_cnt()}, - indices_elem_cnt_{sorted_in_elem_cnt_} { - const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); - sorted_in_ptr_ = reinterpret_cast(ptr); - indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) - + sorted_in_aligned_bytes); - temp_storage_ptr_ = - reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); - temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - T* SortedInPtr() const { return sorted_in_ptr_; } - int32_t* IndicesPtr() const { return indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int32_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int32_t capacity_; - - T* sorted_in_ptr_; - int32_t* indices_ptr_; - void* temp_storage_ptr_; - - int64_t sorted_in_elem_cnt_; - int64_t indices_elem_cnt_; - int32_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; -} - -} // namespace - -template -class GpuArgSortKernel final : public user_op::OpKernel { - public: - GpuArgSortKernel() = default; - ~GpuArgSortKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), - tmp_buffer->mut_dptr(), in->shape_view()); - - const int32_t elem_cnt = in->shape_view().elem_cnt(); - const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int32_t instance_num = elem_cnt / instance_size; - const std::string& direction = ctx->Attr("direction"); - InitializeIndices<<stream()->As()->cuda_stream()>>>( - elem_cnt, buf_manager.IndicesPtr(), instance_size); - if (direction == "ASCENDING") { - SortPairsAscending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else if (direction == "DESCENDING") { - SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype) \ - REGISTER_USER_KERNEL("arg_sort") \ - .SetCreateFn>() \ - 
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int32_t elem_cnt = in_shape.elem_cnt(); \ - const int32_t instance_size = in_shape.dim_vec().back(); \ - const int32_t instance_num = elem_cnt / instance_size; \ - \ - /* Sorted In */ \ - const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ - /* Indices */ \ - const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t)); \ - /* CUB Temp Storage */ \ - int32_t temp_storage_bytes = -1; \ - const std::string& direction = ctx->Attr("direction"); \ - if (direction == "ASCENDING") { \ - temp_storage_bytes = \ - InferTempStorageForSortPairsAscending(instance_num, instance_size); \ - } else if (direction == "DESCENDING") { \ - temp_storage_bytes = \ - InferTempStorageForSortPairsDescending(instance_num, instance_size); \ - } else { \ - UNIMPLEMENTED(); \ - } \ - \ - return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes; \ - }); - -REGISTER_CUDA_ARG_SORT_KERNEL(float) -REGISTER_CUDA_ARG_SORT_KERNEL(double) -REGISTER_CUDA_ARG_SORT_KERNEL(bool) -REGISTER_CUDA_ARG_SORT_KERNEL(int8_t) -REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t) -REGISTER_CUDA_ARG_SORT_KERNEL(int32_t) -REGISTER_CUDA_ARG_SORT_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape) + : capacity_{capacity}, + sorted_in_elem_cnt_{in_shape.elem_cnt()}, + indices_elem_cnt_{sorted_in_elem_cnt_} { + const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); + sorted_in_ptr_ = reinterpret_cast(ptr); + indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) + + sorted_in_aligned_bytes); + temp_storage_ptr_ = + reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); + temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + T* SortedInPtr() const { return sorted_in_ptr_; } + int32_t* IndicesPtr() const { return indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int32_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int32_t capacity_; + + T* sorted_in_ptr_; + int32_t* indices_ptr_; + void* temp_storage_ptr_; + + int64_t sorted_in_elem_cnt_; + int64_t indices_elem_cnt_; + int32_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; +} + +} // namespace + +template +class GpuArgSortKernel final : public user_op::OpKernel { + public: + GpuArgSortKernel() = default; + ~GpuArgSortKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr(), in->shape_view()); + + const int32_t elem_cnt = in->shape_view().elem_cnt(); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = elem_cnt / instance_size; + const std::string& direction = ctx->Attr("direction"); + InitializeIndices<<stream()->As()->cuda_stream()>>>( + elem_cnt, buf_manager.IndicesPtr(), instance_size); + if (direction == "ASCENDING") { + SortPairsAscending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else if (direction == "DESCENDING") { + SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype) \ + REGISTER_USER_KERNEL("arg_sort") \ + .SetCreateFn>() \ + 
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int32_t elem_cnt = in_shape.elem_cnt(); \ + const int32_t instance_size = in_shape.dim_vec().back(); \ + const int32_t instance_num = elem_cnt / instance_size; \ + \ + /* Sorted In */ \ + const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ + /* Indices */ \ + const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t)); \ + /* CUB Temp Storage */ \ + int32_t temp_storage_bytes = -1; \ + const std::string& direction = ctx->Attr("direction"); \ + if (direction == "ASCENDING") { \ + temp_storage_bytes = \ + InferTempStorageForSortPairsAscending(instance_num, instance_size); \ + } else if (direction == "DESCENDING") { \ + temp_storage_bytes = \ + InferTempStorageForSortPairsDescending(instance_num, instance_size); \ + } else { \ + UNIMPLEMENTED(); \ + } \ + \ + return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes; \ + }); + +REGISTER_CUDA_ARG_SORT_KERNEL(float) +REGISTER_CUDA_ARG_SORT_KERNEL(double) +REGISTER_CUDA_ARG_SORT_KERNEL(bool) +REGISTER_CUDA_ARG_SORT_KERNEL(int8_t) +REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t) +REGISTER_CUDA_ARG_SORT_KERNEL(int32_t) +REGISTER_CUDA_ARG_SORT_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/arg_where_kernel_util.hip.cpp b/oneflow/user/kernels/arg_where_kernel_util.hip.cpp index 4157777..9d78b08 100644 --- a/oneflow/user/kernels/arg_where_kernel_util.hip.cpp +++ b/oneflow/user/kernels/arg_where_kernel_util.hip.cpp @@ -1,142 +1,142 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
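A sketch of the tmp_buffer partition that TmpBufferManager and the SetInferTmpSizeFn lambda above agree on: a sorted copy of the input, an int32 index buffer, and the workspace hipCUB reports for the pairwise radix sort, laid out back to back with each region rounded up to an alignment boundary. The 512-byte alignment below is an assumption standing in for GetCudaAlignedSize.

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kAlign = 512;  // assumed CUDA/HIP allocation alignment

size_t AlignedSize(size_t bytes) { return (bytes + kAlign - 1) / kAlign * kAlign; }

// Mirrors the sum returned by the InferTmpSizeFn: sorted-in + indices + CUB workspace.
size_t ArgSortTmpBytes(int64_t elem_cnt, size_t sizeof_dtype, size_t cub_temp_bytes) {
  const size_t sorted_in_bytes = AlignedSize(elem_cnt * sizeof_dtype);   // SortedInPtr region
  const size_t indices_bytes = AlignedSize(elem_cnt * sizeof(int32_t));  // IndicesPtr region
  return sorted_in_bytes + indices_bytes + cub_temp_bytes;               // TempStoragePtr region
}

int main() {
  printf("%zu bytes\n", ArgSortTmpBytes(/*elem_cnt=*/1000, sizeof(float), /*cub_temp_bytes=*/4096));
  return 0;
}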
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/arg_where_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/common/small_vector.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -template -struct StrideIterator { - typedef StrideIterator self_type; - typedef std::ptrdiff_t difference_type; - typedef T value_type; - typedef T* pointer; - typedef T& reference; - typedef std::random_access_iterator_tag iterator_category; - - explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {} - - OF_DEVICE_FUNC reference operator[](int i) { - assert(0 <= i && i < max_iters_); - return *(ptr_ + (i * NDIM)); - } - - private: - T* ptr_; - size_t max_iters_; -}; - -template -__global__ void __launch_bounds__(kBlockSize) - CudaOffsetToNdIndexInplace(NdIndexOffsetHelper index_converter, - const T* output_size_ptr, T* output_ptr) { - CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) { - T* index_ptr = output_ptr + i * NDIM; - index_converter.OffsetToNdIndex(*index_ptr, index_ptr); - } -} - -template -struct IsTrue { - __device__ __forceinline__ bool operator()(const T& val) const { return static_cast(val); } -}; - -template -hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage, - size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter, - OUT_T* num_selected) { - IsTrue is_true; - hipcub::TransformInputIterator, const IN_T*> flag_iter(input, is_true); - hipcub::CountingInputIterator offset_counter(0); - return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter, - output_iter, num_selected, num_items, stream, false); -} - -} // namespace - -template -struct ArgWhereKernelUtil { - static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr, - void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr, - OUT_T* output_size_ptr) { - const int64_t elem_cnt = input_shape.elem_cnt(); - // deal with empty blob - if (elem_cnt == 0) { - Memset(stream, output_size_ptr, 0, sizeof(OUT_T)); - return; - } - - CHECK_NOTNULL(stream); - CHECK_LE(elem_cnt, std::numeric_limits::max()); - size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt); - CHECK_LE(workspace, temp_storage_bytes); - - if (NDIM == 1) { - OF_CUDA_CHECK((SelectTrue( - stream->As()->cuda_stream(), input_shape.elem_cnt(), temp_storage, - workspace, input_ptr, output_ptr, output_size_ptr))); - } else { - using OutputIterator = StrideIterator; - OutputIterator output_iter(output_ptr, elem_cnt); - OF_CUDA_CHECK((SelectTrue( - stream->As()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr, - output_iter, output_size_ptr))); - - OUT_T dims[NDIM] = {0}; - std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims, - [](int64_t dim) { return static_cast(dim); }); - NdIndexOffsetHelper index_converter(dims); - CudaOffsetToNdIndexInplace - <<As()->cuda_stream()>>>( - index_converter, output_size_ptr, output_ptr); - } - } - - static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) { - hipStream_t cuda_stream = stream ? 
stream->As()->cuda_stream() : 0; - size_t workspace = 0; - if (NDIM == 1) { - OF_CUDA_CHECK((SelectTrue(cuda_stream, elem_cnt, nullptr, workspace, - nullptr, nullptr, nullptr))); - } else { - using OutputIterator = StrideIterator; - OutputIterator output_iter(nullptr, elem_cnt); - OF_CUDA_CHECK((SelectTrue( - cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr))); - } - return workspace; - } -}; - -INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/arg_where_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/common/small_vector.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +template +struct StrideIterator { + typedef StrideIterator self_type; + typedef std::ptrdiff_t difference_type; + typedef T value_type; + typedef T* pointer; + typedef T& reference; + typedef std::random_access_iterator_tag iterator_category; + + explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {} + + OF_DEVICE_FUNC reference operator[](int i) { + assert(0 <= i && i < max_iters_); + return *(ptr_ + (i * NDIM)); + } + + private: + T* ptr_; + size_t max_iters_; +}; + +template +__global__ void __launch_bounds__(kBlockSize) + CudaOffsetToNdIndexInplace(NdIndexOffsetHelper index_converter, + const T* output_size_ptr, T* output_ptr) { + CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) { + T* index_ptr = output_ptr + i * NDIM; + index_converter.OffsetToNdIndex(*index_ptr, index_ptr); + } +} + +template +struct IsTrue { + __device__ __forceinline__ bool operator()(const T& val) const { return static_cast(val); } +}; + +template +hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage, + size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter, + OUT_T* num_selected) { + IsTrue is_true; + hipcub::TransformInputIterator, const IN_T*> flag_iter(input, is_true); + hipcub::CountingInputIterator offset_counter(0); + return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter, + output_iter, num_selected, num_items, stream, false); +} + +} // namespace + +template +struct ArgWhereKernelUtil { + static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr, + void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr, + OUT_T* output_size_ptr) { + const int64_t elem_cnt = input_shape.elem_cnt(); + // deal with empty blob + if (elem_cnt == 0) { + Memset(stream, output_size_ptr, 0, sizeof(OUT_T)); + return; + } + 
+ CHECK_NOTNULL(stream); + CHECK_LE(elem_cnt, std::numeric_limits::max()); + size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt); + CHECK_LE(workspace, temp_storage_bytes); + + if (NDIM == 1) { + OF_CUDA_CHECK((SelectTrue( + stream->As()->cuda_stream(), input_shape.elem_cnt(), temp_storage, + workspace, input_ptr, output_ptr, output_size_ptr))); + } else { + using OutputIterator = StrideIterator; + OutputIterator output_iter(output_ptr, elem_cnt); + OF_CUDA_CHECK((SelectTrue( + stream->As()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr, + output_iter, output_size_ptr))); + + OUT_T dims[NDIM] = {0}; + std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims, + [](int64_t dim) { return static_cast(dim); }); + NdIndexOffsetHelper index_converter(dims); + CudaOffsetToNdIndexInplace + <<As()->cuda_stream()>>>( + index_converter, output_size_ptr, output_ptr); + } + } + + static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) { + hipStream_t cuda_stream = stream ? stream->As()->cuda_stream() : 0; + size_t workspace = 0; + if (NDIM == 1) { + OF_CUDA_CHECK((SelectTrue(cuda_stream, elem_cnt, nullptr, workspace, + nullptr, nullptr, nullptr))); + } else { + using OutputIterator = StrideIterator; + OutputIterator output_iter(nullptr, elem_cnt); + OF_CUDA_CHECK((SelectTrue( + cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr))); + } + return workspace; + } +}; + +INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/argmax_kernel.hip.cpp b/oneflow/user/kernels/argmax_kernel.hip.cpp index 80b75e4..2d84443 100644 --- a/oneflow/user/kernels/argmax_kernel.hip.cpp +++ b/oneflow/user/kernels/argmax_kernel.hip.cpp @@ -1,194 +1,194 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
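A self-contained sketch of the two-pass hipcub::DeviceSelect::Flagged pattern that SelectTrue wraps: the first call with a null workspace only reports temp_storage_bytes, the second performs the selection. For brevity the flags here are a precomputed int mask rather than the TransformInputIterator over the input tensor, and the selected values are flat offsets from a counting iterator, as in the NDIM == 1 path; error checking is omitted.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 8;
  const int h_flags[n] = {0, 1, 0, 1, 1, 0, 0, 1};
  int *d_flags = nullptr, *d_out = nullptr, *d_num = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_flags), n * sizeof(int));
  hipMalloc(reinterpret_cast<void**>(&d_out), n * sizeof(int));
  hipMalloc(reinterpret_cast<void**>(&d_num), sizeof(int));
  hipMemcpy(d_flags, h_flags, n * sizeof(int), hipMemcpyHostToDevice);

  hipcub::CountingInputIterator<int> offsets(0);  // values to select: flat offsets 0..n-1
  size_t temp_bytes = 0;
  // Pass 1: null workspace, only queries the required temp storage size.
  hipcub::DeviceSelect::Flagged(nullptr, temp_bytes, offsets, d_flags, d_out, d_num, n);
  void* d_temp = nullptr;
  hipMalloc(&d_temp, temp_bytes);
  // Pass 2: the actual selection; d_num receives the number of selected items.
  hipcub::DeviceSelect::Flagged(d_temp, temp_bytes, offsets, d_flags, d_out, d_num, n);

  int h_num = 0;
  hipMemcpy(&h_num, d_num, sizeof(int), hipMemcpyDeviceToHost);
  int h_out[n] = {0};
  hipMemcpy(h_out, d_out, h_num * sizeof(int), hipMemcpyDeviceToHost);
  for (int i = 0; i < h_num; ++i) { printf("%d ", h_out[i]); }  // 1 3 4 7
  printf("\n");
  hipFree(d_temp); hipFree(d_num); hipFree(d_out); hipFree(d_flags);
  return 0;
}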
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num) - : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} { - const int32_t key_value_out_aligned_bytes = - GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair)); - key_value_out_ptr_ = reinterpret_cast*>(ptr); - temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(key_value_out_ptr_) - + key_value_out_aligned_bytes); - temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - hipcub::KeyValuePair* KeyValueOutPtr() const { return key_value_out_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int32_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int32_t capacity_; - - hipcub::KeyValuePair* key_value_out_ptr_; - void* temp_storage_ptr_; - - int32_t key_value_out_elem_cnt_; - int32_t temp_storage_bytes_; -}; - -class MultiplyFunctor final { - public: - MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} - __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { - return idx * num_col_; - } - - private: - int32_t num_col_; -}; - -template -size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = - hipcub::DeviceSegmentedReduce::ArgMax*, SegmentOffsetIter>( - /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes, - /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, /* d_end_offsets */ segment_offset_iter + 1, - /* stream */ 0); - - // auto err = - // hipcub::DeviceReduce::ArgMax*>( - // nullptr, temp_storage_bytes, - // nullptr, nullptr, num_row, - // 0); - - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr, - int32_t temp_storage_bytes, hipcub::KeyValuePair* out_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - // void * d_temp_storage = nullptr; - // hipMalloc((void **)&d_temp_storage, rt_inferred_temp_storage_bytes); - - auto err = hipcub::DeviceSegmentedReduce::ArgMax( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_in */ in_ptr, - /* d_out */ out_ptr, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* stream */ stream); - - // auto err = - // hipcub::DeviceReduce::ArgMax( - // d_temp_storage, rt_inferred_temp_storage_bytes, - // in_ptr, out_ptr, num_row, - // stream); - - OF_CUDA_CHECK(err); -} - -template 
-__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size, - const hipcub::KeyValuePair* key_value_out_ptr, - int64_t* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, instance_num) { out_ptr[i] = key_value_out_ptr[i].key % instance_size; } -} - -} // namespace - -template -class GpuArgMaxKernel final : public user_op::OpKernel { - public: - GpuArgMaxKernel() = default; - ~GpuArgMaxKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int32_t elem_cnt = in->shape_view().elem_cnt(); - const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int32_t instance_num = elem_cnt / instance_size; - TmpBufferManager buffer_manager(tmp_buffer->shape_view().elem_cnt(), - tmp_buffer->mut_dptr(), instance_num); - - ArgMax(in->dptr(), instance_num, instance_size, buffer_manager.TempStoragePtr(), - buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(), - ctx->stream()->As()->cuda_stream()); - WriteKeysToOutput<<stream()->As()->cuda_stream()>>>( - instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ARGMAX_KERNEL(dtype) \ - REGISTER_USER_KERNEL("argmax") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int32_t instance_size = in_shape.dim_vec().back(); \ - const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ - \ - /* Key-Value Out */ \ - int32_t key_value_out_bytes = \ - GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair)); \ - \ - /* CUB Temp Storage */ \ - size_t temp_storage_bytes = InferTempStorageForArgMax(instance_num, instance_size); \ - \ - return key_value_out_bytes + temp_storage_bytes; \ - }); - -REGISTER_CUDA_ARGMAX_KERNEL(float) -REGISTER_CUDA_ARGMAX_KERNEL(double) -REGISTER_CUDA_ARGMAX_KERNEL(uint8_t) -REGISTER_CUDA_ARGMAX_KERNEL(int8_t) -REGISTER_CUDA_ARGMAX_KERNEL(int32_t) -REGISTER_CUDA_ARGMAX_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num) + : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} { + const int32_t key_value_out_aligned_bytes = + GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair)); + key_value_out_ptr_ = reinterpret_cast*>(ptr); + temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(key_value_out_ptr_) + + key_value_out_aligned_bytes); + temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + hipcub::KeyValuePair* KeyValueOutPtr() const { return key_value_out_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int32_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int32_t capacity_; + + hipcub::KeyValuePair* key_value_out_ptr_; + void* temp_storage_ptr_; + + int32_t key_value_out_elem_cnt_; + int32_t temp_storage_bytes_; +}; + +class MultiplyFunctor final { + public: + MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} + __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { + return idx * num_col_; + } + + private: + int32_t num_col_; +}; + +template +size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = + hipcub::DeviceSegmentedReduce::ArgMax*, SegmentOffsetIter>( + /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes, + /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, /* d_end_offsets */ segment_offset_iter + 1, + /* stream */ 0); + + // auto err = + // hipcub::DeviceReduce::ArgMax*>( + // nullptr, temp_storage_bytes, + // nullptr, nullptr, num_row, + // 0); + + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr, + int32_t temp_storage_bytes, hipcub::KeyValuePair* out_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + // void * d_temp_storage = nullptr; + // hipMalloc((void **)&d_temp_storage, rt_inferred_temp_storage_bytes); + + auto err = hipcub::DeviceSegmentedReduce::ArgMax( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_in */ in_ptr, + /* d_out */ out_ptr, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* stream */ stream); + + // auto err = + // hipcub::DeviceReduce::ArgMax( + // d_temp_storage, rt_inferred_temp_storage_bytes, + // in_ptr, out_ptr, num_row, + // stream); + + OF_CUDA_CHECK(err); +} + +template 
+__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size, + const hipcub::KeyValuePair* key_value_out_ptr, + int64_t* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, instance_num) { out_ptr[i] = key_value_out_ptr[i].key % instance_size; } +} + +} // namespace + +template +class GpuArgMaxKernel final : public user_op::OpKernel { + public: + GpuArgMaxKernel() = default; + ~GpuArgMaxKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int32_t elem_cnt = in->shape_view().elem_cnt(); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = elem_cnt / instance_size; + TmpBufferManager buffer_manager(tmp_buffer->shape_view().elem_cnt(), + tmp_buffer->mut_dptr(), instance_num); + + ArgMax(in->dptr(), instance_num, instance_size, buffer_manager.TempStoragePtr(), + buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(), + ctx->stream()->As()->cuda_stream()); + WriteKeysToOutput<<stream()->As()->cuda_stream()>>>( + instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ARGMAX_KERNEL(dtype) \ + REGISTER_USER_KERNEL("argmax") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int32_t instance_size = in_shape.dim_vec().back(); \ + const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ + \ + /* Key-Value Out */ \ + int32_t key_value_out_bytes = \ + GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair)); \ + \ + /* CUB Temp Storage */ \ + size_t temp_storage_bytes = InferTempStorageForArgMax(instance_num, instance_size); \ + \ + return key_value_out_bytes + temp_storage_bytes; \ + }); + +REGISTER_CUDA_ARGMAX_KERNEL(float) +REGISTER_CUDA_ARGMAX_KERNEL(double) +REGISTER_CUDA_ARGMAX_KERNEL(uint8_t) +REGISTER_CUDA_ARGMAX_KERNEL(int8_t) +REGISTER_CUDA_ARGMAX_KERNEL(int32_t) +REGISTER_CUDA_ARGMAX_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/as_strided_kernel.hip.cpp b/oneflow/user/kernels/as_strided_kernel.hip.cpp index ef8972e..2448f45 100644 --- a/oneflow/user/kernels/as_strided_kernel.hip.cpp +++ b/oneflow/user/kernels/as_strided_kernel.hip.cpp @@ -1,199 +1,199 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
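A small host-side illustration of the segment-offset trick in InferTempStorageForArgMax and ArgMax above: rather than materialising a num_row + 1 offsets array, a counting iterator transformed by MultiplyFunctor yields i * num_col on the fly, so passing segment_offset_iter as d_begin_offsets and segment_offset_iter + 1 as d_end_offsets describes num_row contiguous rows.

#include <cstdio>

int main() {
  const int num_row = 3, num_col = 4;
  // What the TransformInputIterator computes for element i.
  auto offset = [&](int i) { return i * num_col; };
  for (int row = 0; row < num_row; ++row) {
    // begin = segment_offset_iter[row], end = (segment_offset_iter + 1)[row]
    printf("segment %d: [%d, %d)\n", row, offset(row), offset(row + 1));
  }
  return 0;
}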
-*/ - -#include -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/common/just.h" -#include "oneflow/core/common/util.h" -#include "oneflow/core/framework/consistency_check.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/common/nd_index_offset_helper.h" - -namespace oneflow { - -namespace { - -constexpr size_t NUM_DIM = 8; - -template -struct AsStridedParams { - NdIndexOffsetHelper destIndexOffsetHelper; - int64_t dest_dims[num_dims]; - int32_t stride[num_dims]; - int32_t dest_num_dims; - int32_t storage_offset; - int32_t input_num; - int32_t output_num; -}; - -template -__global__ void AsStrided_kernel(const T* input_buf, T* output_buf, - AsStridedParams params) { - const int64_t* dest_dims = reinterpret_cast(params.dest_dims); - const int32_t* stride = reinterpret_cast(params.stride); - - CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { - int64_t dst_index[NUM_DIM]; - params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims); - int32_t index_in_input = params.storage_offset; - FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; } - output_buf[i] = input_buf[index_in_input]; - } -} - -template -__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf, - AsStridedParams params) { - const int64_t* dest_dims = reinterpret_cast(params.dest_dims); - const int32_t* stride = reinterpret_cast(params.stride); - CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { - int64_t dy_index[NUM_DIM]; - params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims); - int32_t index_in_dx = params.storage_offset; - FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; } - cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]); - } -} - -template -struct AsStridedFunctor final { - void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims, - const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset, - const int32_t input_num, const int32_t output_num) { - NdIndexOffsetHelper destIndexOffsetHelper(dest_dims, dest_num_dims); - AsStridedParams params; - params.destIndexOffsetHelper = destIndexOffsetHelper; - FOR_RANGE(size_t, i, 0, dest_num_dims) { - params.dest_dims[i] = dest_dims[i]; - params.stride[i] = stride[i]; - } - params.dest_num_dims = dest_num_dims; - params.storage_offset = storage_offset; - params.input_num = input_num; - params.output_num = output_num; - - AsStrided_kernel - <<As()->cuda_stream()>>>(input_buf, output_buf, params); - } -}; - -template -struct AsStridedGradFunctor final { - void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims, - const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset, - const int32_t dx_num, const int32_t dy_num) { - NdIndexOffsetHelper dyIndexOffsetHelper(dy_dims, dy_num_dims); - AsStridedParams params; - params.destIndexOffsetHelper = dyIndexOffsetHelper; - FOR_RANGE(size_t, i, 0, dy_num_dims) { - params.dest_dims[i] = dy_dims[i]; - params.stride[i] = stride[i]; - } - params.dest_num_dims = dy_num_dims; - params.storage_offset = storage_offset; - params.input_num = dx_num; - params.output_num = dy_num; - - AsStridedGrad_kernel - <<As()->cuda_stream()>>>(dy_buf, dx_buf, params); - } -}; - -} // namespace - -template -class GpuAsStridedKernel 
final : public user_op::OpKernel { - public: - GpuAsStridedKernel() = default; - ~GpuAsStridedKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0); - user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0); - const auto size = ctx->Attr>("size"); - const auto stride = ctx->Attr>("stride"); - const int32_t storage_offset = ctx->Attr("storage_offset"); - - size_t dest_num_dims = output->shape_view().NumAxes(); - const int64_t* dest_dims = output->shape_view().ptr(); - const size_t input_num = input->shape_view().Count(0); - const size_t output_num = output->shape_view().Count(0); - if (input_num == 0) { - // 0-size tensor - return; - } - - AsStridedFunctor()(ctx->stream(), input->dptr(), output->mut_dptr(), dest_dims, - stride.data(), dest_num_dims, storage_offset, input_num, output_num); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuAsStridedGradKernel final : public user_op::OpKernel { - public: - GpuAsStridedGradKernel() = default; - ~GpuAsStridedGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const auto size = ctx->Attr>("size"); - const auto stride = ctx->Attr>("stride"); - const int32_t storage_offset = ctx->Attr("storage_offset"); - - size_t dy_num_dims = dy->shape_view().NumAxes(); - const int64_t* dy_dims = dy->shape_view().ptr(); - const size_t dx_num = dx->shape_view().Count(0); - const size_t dy_num = dy->shape_view().Count(0); - - Memset(ctx->stream(), dx->mut_dptr(), 0, - dx->shape_view().Count(0) * sizeof(T)); - - AsStridedGradFunctor()(ctx->stream(), dy->dptr(), dx->mut_dptr(), dy_dims, - stride.data(), dy_num_dims, storage_offset, dx_num, dy_num); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_GPUASSTRIDED_KERNEL(in_type) \ - REGISTER_USER_KERNEL("as_strided") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("as_strided_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)); - -REGISTER_GPUASSTRIDED_KERNEL(half); -REGISTER_GPUASSTRIDED_KERNEL(float); -REGISTER_GPUASSTRIDED_KERNEL(double); -REGISTER_GPUASSTRIDED_KERNEL(int64_t); - -#undef REGISTER_GPUASSTRIDED_KERNEL - +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/common/just.h" +#include "oneflow/core/common/util.h" +#include "oneflow/core/framework/consistency_check.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/common/nd_index_offset_helper.h" + +namespace oneflow { + +namespace { + +constexpr size_t NUM_DIM = 8; + +template +struct AsStridedParams { + NdIndexOffsetHelper destIndexOffsetHelper; + int64_t dest_dims[num_dims]; + int32_t stride[num_dims]; + int32_t dest_num_dims; + int32_t storage_offset; + int32_t input_num; + int32_t output_num; +}; + +template +__global__ void AsStrided_kernel(const T* input_buf, T* output_buf, + AsStridedParams params) { + const int64_t* dest_dims = reinterpret_cast(params.dest_dims); + const int32_t* stride = reinterpret_cast(params.stride); + + CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { + int64_t dst_index[NUM_DIM]; + params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims); + int32_t index_in_input = params.storage_offset; + FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; } + output_buf[i] = input_buf[index_in_input]; + } +} + +template +__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf, + AsStridedParams params) { + const int64_t* dest_dims = reinterpret_cast(params.dest_dims); + const int32_t* stride = reinterpret_cast(params.stride); + CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) { + int64_t dy_index[NUM_DIM]; + params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims); + int32_t index_in_dx = params.storage_offset; + FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; } + cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]); + } +} + +template +struct AsStridedFunctor final { + void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims, + const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset, + const int32_t input_num, const int32_t output_num) { + NdIndexOffsetHelper destIndexOffsetHelper(dest_dims, dest_num_dims); + AsStridedParams params; + params.destIndexOffsetHelper = destIndexOffsetHelper; + FOR_RANGE(size_t, i, 0, dest_num_dims) { + params.dest_dims[i] = dest_dims[i]; + params.stride[i] = stride[i]; + } + params.dest_num_dims = dest_num_dims; + params.storage_offset = storage_offset; + params.input_num = input_num; + params.output_num = output_num; + + AsStrided_kernel + <<As()->cuda_stream()>>>(input_buf, output_buf, params); + } +}; + +template +struct AsStridedGradFunctor final { + void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims, + const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset, + const int32_t dx_num, const int32_t dy_num) { + NdIndexOffsetHelper dyIndexOffsetHelper(dy_dims, dy_num_dims); + AsStridedParams params; + params.destIndexOffsetHelper = dyIndexOffsetHelper; + FOR_RANGE(size_t, i, 0, dy_num_dims) { + params.dest_dims[i] = dy_dims[i]; + params.stride[i] = stride[i]; + } + params.dest_num_dims = dy_num_dims; + params.storage_offset = storage_offset; + params.input_num = dx_num; + params.output_num = dy_num; + + AsStridedGrad_kernel + <<As()->cuda_stream()>>>(dy_buf, dx_buf, params); + } +}; + +} // namespace + +template +class GpuAsStridedKernel 
final : public user_op::OpKernel { + public: + GpuAsStridedKernel() = default; + ~GpuAsStridedKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0); + user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0); + const auto size = ctx->Attr>("size"); + const auto stride = ctx->Attr>("stride"); + const int32_t storage_offset = ctx->Attr("storage_offset"); + + size_t dest_num_dims = output->shape_view().NumAxes(); + const int64_t* dest_dims = output->shape_view().ptr(); + const size_t input_num = input->shape_view().Count(0); + const size_t output_num = output->shape_view().Count(0); + if (input_num == 0) { + // 0-size tensor + return; + } + + AsStridedFunctor()(ctx->stream(), input->dptr(), output->mut_dptr(), dest_dims, + stride.data(), dest_num_dims, storage_offset, input_num, output_num); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuAsStridedGradKernel final : public user_op::OpKernel { + public: + GpuAsStridedGradKernel() = default; + ~GpuAsStridedGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const auto size = ctx->Attr>("size"); + const auto stride = ctx->Attr>("stride"); + const int32_t storage_offset = ctx->Attr("storage_offset"); + + size_t dy_num_dims = dy->shape_view().NumAxes(); + const int64_t* dy_dims = dy->shape_view().ptr(); + const size_t dx_num = dx->shape_view().Count(0); + const size_t dy_num = dy->shape_view().Count(0); + + Memset(ctx->stream(), dx->mut_dptr(), 0, + dx->shape_view().Count(0) * sizeof(T)); + + AsStridedGradFunctor()(ctx->stream(), dy->dptr(), dx->mut_dptr(), dy_dims, + stride.data(), dy_num_dims, storage_offset, dx_num, dy_num); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_GPUASSTRIDED_KERNEL(in_type) \ + REGISTER_USER_KERNEL("as_strided") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("as_strided_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)); + +REGISTER_GPUASSTRIDED_KERNEL(half); +REGISTER_GPUASSTRIDED_KERNEL(float); +REGISTER_GPUASSTRIDED_KERNEL(double); +REGISTER_GPUASSTRIDED_KERNEL(int64_t); + +#undef REGISTER_GPUASSTRIDED_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/assign_if_kernel.hip.cpp b/oneflow/user/kernels/assign_if_kernel.hip.cpp index 3752a71..6163c48 100644 --- a/oneflow/user/kernels/assign_if_kernel.hip.cpp +++ b/oneflow/user/kernels/assign_if_kernel.hip.cpp @@ -1,76 +1,76 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) { - if (assign_if == (*condition == 0)) { return; } - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; } -} - -template -class AssignIfGPUKernel final : public user_op::OpKernel { - public: - AssignIfGPUKernel() = default; - ~AssignIfGPUKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0); - CHECK_EQ(condition->shape_view().NumAxes(), 1); - CHECK_EQ(condition->shape_view().At(0), 1); - const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); - user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0); - if (value->dptr() == ref->dptr()) { return; } - CHECK_EQ(value->shape_view(), ref->shape_view()); - CHECK_EQ(value->data_type(), ref->data_type()); - const size_t elem_cnt = ref->shape_view().elem_cnt(); - AssignGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, condition->dptr(), value->dptr(), ref->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } -}; - -} // namespace - -#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \ - value_type) \ - REGISTER_USER_KERNEL(op_type_name) \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("condition", 0) == GetDataType::value) \ - && (user_op::HobDataType("value", 0) == GetDataType::value)); - -#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type) \ - REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ - "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \ - REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ - "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ, - POD_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) { + if (assign_if == (*condition == 0)) { return; } + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; } +} + +template +class AssignIfGPUKernel final : public user_op::OpKernel { + public: + AssignIfGPUKernel() = default; + ~AssignIfGPUKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0); + CHECK_EQ(condition->shape_view().NumAxes(), 1); + CHECK_EQ(condition->shape_view().At(0), 1); + const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); + user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0); + if (value->dptr() == ref->dptr()) { return; } + CHECK_EQ(value->shape_view(), ref->shape_view()); + CHECK_EQ(value->data_type(), ref->data_type()); + const size_t elem_cnt = ref->shape_view().elem_cnt(); + AssignGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, condition->dptr(), value->dptr(), ref->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } +}; + +} // namespace + +#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \ + value_type) \ + REGISTER_USER_KERNEL(op_type_name) \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("condition", 0) == GetDataType::value) \ + && (user_op::HobDataType("value", 0) == GetDataType::value)); + +#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type) \ + REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ + "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \ + REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL( \ + "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ, + POD_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/avg_pool_kernel.hip.cpp b/oneflow/user/kernels/avg_pool_kernel.hip.cpp index e2abbff..a8777ba 100644 --- a/oneflow/user/kernels/avg_pool_kernel.hip.cpp +++ b/oneflow/user/kernels/avg_pool_kernel.hip.cpp @@ -1,200 +1,200 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/avg_pool_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetMinThreadNum(const int64_t elem_num) { return std::min(elem_num, kBlockSize); } - -int GetNumBlocks(int32_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -} // namespace - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int32_t padding_l, const int32_t n_batch, - const int32_t n_channel, const int32_t x_length, - const int32_t kernel_size_l, const int32_t stride_l, - const bool count_include_pad, const int32_t divisor_override) { - Avgpool1dForwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, - x_length, kernel_size_l, stride_l, count_include_pad, - divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, const int32_t padding_h, - const int32_t padding_w, const int32_t n_batch, - const int32_t n_channel, const int32_t x_height, - const int32_t x_width, const int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_h, - const int32_t stride_w, const bool count_include_pad, - const int32_t divisor_override) { - Avgpool2dForwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, - n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h, - stride_w, count_include_pad, divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int32_t padding_t, const int32_t padding_h, - const int32_t padding_w, const int32_t n_batch, - const int32_t n_channel, const int32_t x_time, - const int32_t x_height, const int32_t x_width, - const int32_t kernel_size_t, int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_t, - const int32_t stride_h, const int32_t stride_w, - const bool count_include_pad, const int32_t divisor_override) { - Avgpool3dForwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, - n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, - kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, - count_include_pad, divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, const int32_t padding_l, - const int32_t n_batch, const int32_t n_channel, - const int32_t input_length, const int32_t kernel_size_l, - const int32_t stride_l, const bool count_include_pad, - const int32_t divisor_override) { - Avgpool1dBackwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, - input_length, kernel_size_l, stride_l, count_include_pad, - divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, const int32_t padding_h, - const int32_t padding_w, const int32_t n_batch, - const int32_t n_channel, const int32_t input_height, - const int32_t input_width, 
const int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_h, - const int32_t stride_w, const bool count_include_pad, - int32_t divisor_override) { - Avgpool2dBackwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, - n_channel, input_height, input_width, kernel_size_h, kernel_size_w, - stride_h, stride_w, count_include_pad, divisor_override); -}; - -template -__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward( - const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, - const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, - const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, - const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h, - const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) { - Avgpool3dBackwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, - n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, - kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, - count_include_pad, divisor_override); -}; - -template -struct AvgPoolKernelUtil { - static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool1dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], - params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); - } - - static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool1dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], - params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); - } - - static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool2dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], - params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), - params_3d.divisor_override()); - } - - static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool2dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], - params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), - params_3d.divisor_override()); - } - - static void Avgpool3dForward(ep::Stream* stream, const 
NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool3dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), - params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.count_include_pad(), params_3d.divisor_override()); - } - - static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const AvgPoolParams3D& params_3d) { - DoCUDAAvgPool3dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), - params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.count_include_pad(), params_3d.divisor_override()); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA), - AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/avg_pool_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetMinThreadNum(const int64_t elem_num) { return std::min(elem_num, kBlockSize); } + +int GetNumBlocks(int32_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +} // namespace + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int32_t padding_l, const int32_t n_batch, + const int32_t n_channel, const int32_t x_length, + const int32_t kernel_size_l, const int32_t stride_l, + const bool count_include_pad, const int32_t divisor_override) { + Avgpool1dForwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, + x_length, kernel_size_l, stride_l, count_include_pad, + divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, const int32_t padding_h, + const int32_t padding_w, const int32_t n_batch, + const int32_t n_channel, const int32_t x_height, + const int32_t x_width, const int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_h, + const int32_t stride_w, const bool count_include_pad, + const int32_t divisor_override) { + Avgpool2dForwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, + n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h, + stride_w, count_include_pad, divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int32_t padding_t, const int32_t padding_h, + const int32_t padding_w, const int32_t n_batch, + const int32_t n_channel, const int32_t x_time, + const int32_t x_height, const int32_t x_width, + const int32_t kernel_size_t, int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_t, + const int32_t stride_h, const int32_t stride_w, + const bool count_include_pad, const int32_t divisor_override) { + Avgpool3dForwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, + n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, + kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, + count_include_pad, divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, const int32_t padding_l, + const int32_t n_batch, const int32_t n_channel, + const int32_t input_length, const int32_t kernel_size_l, + const int32_t stride_l, const bool count_include_pad, + const int32_t divisor_override) { + Avgpool1dBackwardCompute(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel, + input_length, kernel_size_l, stride_l, count_include_pad, + divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, const int32_t padding_h, + const int32_t padding_w, const int32_t n_batch, + const int32_t n_channel, const int32_t input_height, + const int32_t input_width, 
const int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_h, + const int32_t stride_w, const bool count_include_pad, + int32_t divisor_override) { + Avgpool2dBackwardCompute(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch, + n_channel, input_height, input_width, kernel_size_h, kernel_size_w, + stride_h, stride_w, count_include_pad, divisor_override); +}; + +template +__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward( + const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, + const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, + const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, + const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h, + const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) { + Avgpool3dBackwardCompute(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w, + n_batch, n_channel, x_time, x_height, x_width, kernel_size_t, + kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, + count_include_pad, divisor_override); +}; + +template +struct AvgPoolKernelUtil { + static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool1dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], + params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); + } + + static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool1dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2], + params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override()); + } + + static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool2dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], + params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), + params_3d.divisor_override()); + } + + static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool2dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2], + params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(), + params_3d.divisor_override()); + } + + static void Avgpool3dForward(ep::Stream* stream, const 
NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool3dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.count_include_pad(), params_3d.divisor_override()); + } + + static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const AvgPoolParams3D& params_3d) { + DoCUDAAvgPool3dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.count_include_pad(), params_3d.divisor_override()); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA), + AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp b/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp index 547fea4..de9cf92 100644 --- a/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp +++ b/oneflow/user/kernels/batch_gather_kernel_util.hip.cpp @@ -1,103 +1,103 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/batch_gather_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices, - const int64_t indices_num, const int64_t instance_size, - const int64_t gather_dim_size) { - const int64_t batch_idx = out_offset / (indices_num * instance_size); - const int64_t indices_idx = out_offset % (indices_num * instance_size) / instance_size; - const int64_t inner_idx = out_offset % instance_size; - const int64_t idx = indices[batch_idx * indices_num + indices_idx]; - assert(idx >= 0 && idx < gather_dim_size); - return batch_idx * gather_dim_size * instance_size + idx * instance_size + inner_idx; -} - -template -__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices, - const int64_t indices_num, const int64_t instance_size, - const int64_t gather_dim_size, T* out) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - out[i] = in[GetInOffset(i, indices, indices_num, instance_size, gather_dim_size)]; - } -} - -template -__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices, - const int64_t indices_num, const int64_t instance_size, - const int64_t gather_dim_size, T* in_diff) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - cuda::atomic::Add( - in_diff + GetInOffset(i, indices, indices_num, instance_size, gather_dim_size), - out_diff[i]); - } -} - -} // namespace - -template -struct BatchGatherKernelUtilImpl final { - static void Forward(ep::Stream* stream, const T* in, const K* indices, - const Shape& flat_out_shape, const int64_t gather_dim_size, T* out); - static void Backward(ep::Stream* stream, const T* out_diff, const K* indices, - const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff); -}; - -template -void BatchGatherKernelUtilImpl::Forward(ep::Stream* stream, const T* in, - const K* indices, - const Shape& flat_out_shape, - const int64_t gather_dim_size, - T* out) { - const int64_t batch_num = flat_out_shape.At(0); - const int64_t indices_num = flat_out_shape.At(1); - const int64_t instance_size = flat_out_shape.At(2); - const int64_t elem_cnt = batch_num * indices_num * instance_size; - BatchGatherForwardGpu<<As()->cuda_stream()>>>( - elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out); -} - -template -void BatchGatherKernelUtilImpl::Backward( - ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape, - const int64_t gather_dim_size, T* in_diff) { - const int64_t batch_num = flat_out_diff_shape.At(0); - const int64_t indices_num = flat_out_diff_shape.At(1); - const int64_t instance_size = flat_out_diff_shape.At(2); - const int64_t elem_cnt = batch_num * indices_num * instance_size; - BatchGatherBackwardGpu<<As()->cuda_stream()>>>( - elem_cnt, out_diff, indices, indices_num, instance_size, gather_dim_size, in_diff); -} - -#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair) \ - template struct BatchGatherKernelUtilImpl; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA, - FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/batch_gather_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices, + const int64_t indices_num, const int64_t instance_size, + const int64_t gather_dim_size) { + const int64_t batch_idx = out_offset / (indices_num * instance_size); + const int64_t indices_idx = out_offset % (indices_num * instance_size) / instance_size; + const int64_t inner_idx = out_offset % instance_size; + const int64_t idx = indices[batch_idx * indices_num + indices_idx]; + assert(idx >= 0 && idx < gather_dim_size); + return batch_idx * gather_dim_size * instance_size + idx * instance_size + inner_idx; +} + +template +__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices, + const int64_t indices_num, const int64_t instance_size, + const int64_t gather_dim_size, T* out) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + out[i] = in[GetInOffset(i, indices, indices_num, instance_size, gather_dim_size)]; + } +} + +template +__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices, + const int64_t indices_num, const int64_t instance_size, + const int64_t gather_dim_size, T* in_diff) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + cuda::atomic::Add( + in_diff + GetInOffset(i, indices, indices_num, instance_size, gather_dim_size), + out_diff[i]); + } +} + +} // namespace + +template +struct BatchGatherKernelUtilImpl final { + static void Forward(ep::Stream* stream, const T* in, const K* indices, + const Shape& flat_out_shape, const int64_t gather_dim_size, T* out); + static void Backward(ep::Stream* stream, const T* out_diff, const K* indices, + const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff); +}; + +template +void BatchGatherKernelUtilImpl::Forward(ep::Stream* stream, const T* in, + const K* indices, + const Shape& flat_out_shape, + const int64_t gather_dim_size, + T* out) { + const int64_t batch_num = flat_out_shape.At(0); + const int64_t indices_num = flat_out_shape.At(1); + const int64_t instance_size = flat_out_shape.At(2); + const int64_t elem_cnt = batch_num * indices_num * instance_size; + BatchGatherForwardGpu<<As()->cuda_stream()>>>( + elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out); +} + +template +void BatchGatherKernelUtilImpl::Backward( + ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape, + const int64_t gather_dim_size, T* in_diff) { + const int64_t batch_num = flat_out_diff_shape.At(0); + const int64_t indices_num = flat_out_diff_shape.At(1); + const int64_t instance_size = flat_out_diff_shape.At(2); + const int64_t elem_cnt = batch_num * indices_num * instance_size; + BatchGatherBackwardGpu<<As()->cuda_stream()>>>( + elem_cnt, out_diff, indices, indices_num, instance_size, 
gather_dim_size, in_diff); +} + +#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair) \ + template struct BatchGatherKernelUtilImpl; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA, + FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp index c1fe0cd..ccceaad 100644 --- a/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp @@ -1,204 +1,204 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -using namespace loss; - -template -struct BinaryCrossEntropyFunctor { - T zero_; - T one_; - T negative_hundred_; - BinaryCrossEntropyFunctor() - : zero_(GetZeroVal()), one_(GetOneVal()), negative_hundred_(static_cast(-100)) {} - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - assert(input_val >= zero_); - assert(input_val <= one_); - return (target_val - one_) * max(static_cast(log(one_ - input_val)), negative_hundred_) - - target_val * max(static_cast(log(input_val)), negative_hundred_); - } - - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { - return (*this)(input_val, target_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyFunctor { - float zero_; - float one_; - float negative_hundred_; - BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {} - __device__ __forceinline__ float operator()(float input_val, float target_val) const { - assert(input_val >= zero_); - assert(input_val <= one_); - return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_) - - target_val * max(logf(input_val), negative_hundred_); - } - - __device__ __forceinline__ float operator()(float input_val, float target_val, - float weight_val) const { - return (*this)(input_val, target_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyFunctor { - BinaryCrossEntropyFunctor float_functor; - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - return __float2half(float_functor(__half2float(input_val), __half2float(target_val))); - } - - __device__ __forceinline__ half operator()(half input_val, half target_val, - half weight_val) const { - return (*this)(input_val, target_val) * weight_val; - } -}; - -template -struct BinaryCrossEntropyGradFunctor { - T eps_; - T one_; - BinaryCrossEntropyGradFunctor() : eps_(static_cast(1e-12)), one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T 
target_val, T dy_val) const { - return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_); - } - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { - return (*this)(input_val, target_val, dy_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyGradFunctor { - BinaryCrossEntropyGradFunctor float_functor; - BinaryCrossEntropyGradFunctor() {} - __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { - return __float2half( - float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val))); - } - __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val, - half weight_val) const { - return __float2half(float_functor(__half2float(input_val), __half2float(target_val), - __half2float(dy_val), __half2float(weight_val))); - } -}; - -template -class BinaryCrossEntropyKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyKernel() = default; - ~BinaryCrossEntropyKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - OF_CUDA_CHECK( - (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, - weight, ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK( - (cuda::elementwise::Binary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class BinaryCrossEntropyGradKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyGradKernel() = default; - ~BinaryCrossEntropyGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* dy = dy_blob->dptr(); - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyGradFunctor(), elem_cnt, dx, input, target, dy, - ctx->stream()->As()->cuda_stream()))); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define 
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) - -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +using namespace loss; + +template +struct BinaryCrossEntropyFunctor { + T zero_; + T one_; + T negative_hundred_; + BinaryCrossEntropyFunctor() + : zero_(GetZeroVal()), one_(GetOneVal()), negative_hundred_(static_cast(-100)) {} + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + assert(input_val >= zero_); + assert(input_val <= one_); + return (target_val - one_) * max(static_cast(log(one_ - input_val)), negative_hundred_) + - target_val * max(static_cast(log(input_val)), negative_hundred_); + } + + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { + return (*this)(input_val, target_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyFunctor { + float zero_; + float one_; + float negative_hundred_; + BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {} + __device__ __forceinline__ float operator()(float input_val, float target_val) const { + assert(input_val >= zero_); + assert(input_val <= one_); + return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_) + - target_val * max(logf(input_val), negative_hundred_); + } + + __device__ __forceinline__ float operator()(float input_val, float target_val, + float weight_val) const { + return (*this)(input_val, target_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyFunctor { + BinaryCrossEntropyFunctor float_functor; + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + return 
__float2half(float_functor(__half2float(input_val), __half2float(target_val))); + } + + __device__ __forceinline__ half operator()(half input_val, half target_val, + half weight_val) const { + return (*this)(input_val, target_val) * weight_val; + } +}; + +template +struct BinaryCrossEntropyGradFunctor { + T eps_; + T one_; + BinaryCrossEntropyGradFunctor() : eps_(static_cast(1e-12)), one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { + return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_); + } + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { + return (*this)(input_val, target_val, dy_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyGradFunctor { + BinaryCrossEntropyGradFunctor float_functor; + BinaryCrossEntropyGradFunctor() {} + __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { + return __float2half( + float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val))); + } + __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val, + half weight_val) const { + return __float2half(float_functor(__half2float(input_val), __half2float(target_val), + __half2float(dy_val), __half2float(weight_val))); + } +}; + +template +class BinaryCrossEntropyKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyKernel() = default; + ~BinaryCrossEntropyKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* out = out_blob->mut_dptr(); + + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + OF_CUDA_CHECK( + (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, + weight, ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK( + (cuda::elementwise::Binary(BinaryCrossEntropyFunctor(), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class BinaryCrossEntropyGradKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyGradKernel() = default; + ~BinaryCrossEntropyGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* dy = dy_blob->dptr(); + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* dx = dx_blob->mut_dptr(); + + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + 
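+      // Weighted branch: GenericLauncher dispatches a four-input elementwise kernel, so each
+      // element evaluates dx = weight * dy * (input - target) / max((1 - input) * input, eps)
+      // with eps = 1e-12, i.e. BinaryCrossEntropyGradFunctor above; the eps clamp guards
+      // against division by zero when input saturates at 0 or 1.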
OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyGradFunctor(), elem_cnt, dx, input, target, dy, + ctx->stream()->As()->cuda_stream()))); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) + +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp index fc19e37..9a7d7c4 100644 --- a/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp @@ -1,373 +1,373 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -using namespace loss; - -enum class WeightType { - kNone, - kWeight, - kPosWeight, - kBoth, -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - T zero_; - T one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - const T max_val = -input_val < zero_ ? 
zero_ : -input_val; - return (one_ - target_val) * input_val + max_val - + (log(exp(-max_val) + exp(-input_val - max_val))); - } -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - T zero_; - T one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { - const T max_val = -input_val < zero_ ? zero_ : -input_val; - const T pos_weight_processed_val = weight_val - target_val + one_; - return (one_ - target_val) * input_val - + (pos_weight_processed_val - * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val)); - } -}; - -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - float zero_; - float one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} - __device__ __forceinline__ float operator()(float input_val, float target_val) const { - const float max_val = -input_val < zero_ ? zero_ : -input_val; - return (one_ - target_val) * input_val + max_val - + (logf(expf(-max_val) + expf(-input_val - max_val))); - } -}; - -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - float zero_; - float one_; - BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} - __device__ __forceinline__ float operator()(float input_val, float target_val, - float weight_val) const { - const float max_val = -input_val < zero_ ? zero_ : -input_val; - const float pos_weight_processed_val = weight_val - target_val + one_; - return (one_ - target_val) * input_val - + (pos_weight_processed_val - * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val)); - } -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { - return f(input_val, target_val) * weight_val; - } -}; - -template -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val, - T pos_weight_val) const { - return f(input_val, target_val, pos_weight_val) * weight_val; - } -}; - -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - return __float2half(f(__half2float(input_val), __half2float(target_val))); - } -}; -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val, - half weight_val) const { - return __float2half( - f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); - } -}; -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val, - half weight_val) const { - return __float2half( - f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); - } -}; -template<> -struct BinaryCrossEntropyWithLogitsFunctor { - BinaryCrossEntropyWithLogitsFunctor f; - __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val, - half pos_weight_val) const { - return __float2half(f(__half2float(input_val), __half2float(target_val), - __half2float(weight_val), __half2float(pos_weight_val))); - } -}; - -template -__device__ __forceinline__ T CalSigmoid(const T x) { - const T half_of_one = static_cast(0.5); - 
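-  // Computes the logistic sigmoid via the identity sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5,
-  // which follows from tanh(x / 2) = (1 - exp(-x)) / (1 + exp(-x)); the float specialization
-  // below uses tanhf, and the half specialization converts through float with
-  // __half2float / __float2half.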
return half_of_one * tanh(half_of_one * x) + half_of_one; -} - -template<> -__device__ __forceinline__ float CalSigmoid(const float x) { - const float half_of_one = static_cast(0.5); - return half_of_one * tanhf(half_of_one * x) + half_of_one; -} - -template<> -__device__ __forceinline__ half CalSigmoid(const half x) { - return __float2half(CalSigmoid(__half2float(x))); -} - -template -struct BinaryCrossEntropyWithLogitsGradFunctor; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { - return (CalSigmoid(input_val) - target_val) * dy_val; - } -}; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - T one_; - BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal()) {} - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { - return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val); - } -}; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - BinaryCrossEntropyWithLogitsGradFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { - return f(input_val, target_val, dy_val) * weight_val; - } -}; - -template -struct BinaryCrossEntropyWithLogitsGradFunctor { - BinaryCrossEntropyWithLogitsGradFunctor f; - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val, - T pos_weight_val) const { - return f(input_val, target_val, dy_val, pos_weight_val) * weight_val; - } -}; - -template -class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyWithLogitsKernel() = default; - ~BinaryCrossEntropyWithLogitsKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - - if (ctx->Attr("has_pos_weight")) { - T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); - const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - - Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); - pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); - NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), - XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape_view(), target)); - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyWithLogitsFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed, - ctx->stream()->As()->cuda_stream()))); - - } else { - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, - target, pos_weight_processed, ctx->stream()->As()->cuda_stream()))); - } - } else { - if 
(ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, - target, weight, ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Binary( - BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, - target, ctx->stream()->As()->cuda_stream()))); - } - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyWithLogitsGradKernel() = default; - ~BinaryCrossEntropyWithLogitsGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); - - const T* dy = dy_blob->dptr(); - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - - if (ctx->Attr("has_pos_weight")) { - T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); - const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); - - Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); - pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, - ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); - NdarrayUtil::BroadcastMul( - ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), - XpuVarNdarray(pos_weight_shape, pos_weight), - XpuVarNdarray(target_blob->shape_view(), target)); - - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed, - ctx->stream()->As()->cuda_stream()))); - - } else { - using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed, - ctx->stream()->As()->cuda_stream()))); - } - } else { - if (ctx->has_input("weight", 0)) { - const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); - using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; - using FactoryT = cuda::elementwise::SimpleFactory; - OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( - FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Ternary( - BinaryCrossEntropyWithLogitsGradFunctor(), elem_cnt, dx, input, - target, dy, ctx->stream()->As()->cuda_stream()))); - } - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -user_op::InferTmpSizeFn GenFwInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - const int64_t n = ctx->InputShape("input", 0).elem_cnt(); - size_t 
tmp_buffer_size = 0; - if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } - return tmp_buffer_size; - }; -} -template -user_op::InferTmpSizeFn GenBwInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - const int64_t n = ctx->InputShape("target", 0).elem_cnt(); - size_t tmp_buffer_size = 0; - if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } - return tmp_buffer_size; - }; -} - -} // namespace - -#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(GenFwInferTmpSizeFn()); - -#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(GenBwInferTmpSizeFn()); - -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) - -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +using namespace loss; + +enum class WeightType { + kNone, + kWeight, + kPosWeight, + kBoth, +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + T zero_; + T one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + const T max_val = -input_val < zero_ ? 
zero_ : -input_val; + return (one_ - target_val) * input_val + max_val + + (log(exp(-max_val) + exp(-input_val - max_val))); + } +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + T zero_; + T one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal()), one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { + const T max_val = -input_val < zero_ ? zero_ : -input_val; + const T pos_weight_processed_val = weight_val - target_val + one_; + return (one_ - target_val) * input_val + + (pos_weight_processed_val + * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val)); + } +}; + +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + float zero_; + float one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} + __device__ __forceinline__ float operator()(float input_val, float target_val) const { + const float max_val = -input_val < zero_ ? zero_ : -input_val; + return (one_ - target_val) * input_val + max_val + + (logf(expf(-max_val) + expf(-input_val - max_val))); + } +}; + +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + float zero_; + float one_; + BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {} + __device__ __forceinline__ float operator()(float input_val, float target_val, + float weight_val) const { + const float max_val = -input_val < zero_ ? zero_ : -input_val; + const float pos_weight_processed_val = weight_val - target_val + one_; + return (one_ - target_val) * input_val + + (pos_weight_processed_val + * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val)); + } +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const { + return f(input_val, target_val) * weight_val; + } +}; + +template +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val, + T pos_weight_val) const { + return f(input_val, target_val, pos_weight_val) * weight_val; + } +}; + +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + return __float2half(f(__half2float(input_val), __half2float(target_val))); + } +}; +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val, + half weight_val) const { + return __float2half( + f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); + } +}; +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val, + half weight_val) const { + return __float2half( + f(__half2float(input_val), __half2float(target_val), __half2float(weight_val))); + } +}; +template<> +struct BinaryCrossEntropyWithLogitsFunctor { + BinaryCrossEntropyWithLogitsFunctor f; + __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val, + half pos_weight_val) const { + return __float2half(f(__half2float(input_val), __half2float(target_val), + __half2float(weight_val), __half2float(pos_weight_val))); + } +}; + +template +__device__ __forceinline__ T CalSigmoid(const T x) { + const T half_of_one = static_cast(0.5); + 
return half_of_one * tanh(half_of_one * x) + half_of_one; +} + +template<> +__device__ __forceinline__ float CalSigmoid(const float x) { + const float half_of_one = static_cast(0.5); + return half_of_one * tanhf(half_of_one * x) + half_of_one; +} + +template<> +__device__ __forceinline__ half CalSigmoid(const half x) { + return __float2half(CalSigmoid(__half2float(x))); +} + +template +struct BinaryCrossEntropyWithLogitsGradFunctor; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { + return (CalSigmoid(input_val) - target_val) * dy_val; + } +}; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + T one_; + BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal()) {} + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { + return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val); + } +}; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + BinaryCrossEntropyWithLogitsGradFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const { + return f(input_val, target_val, dy_val) * weight_val; + } +}; + +template +struct BinaryCrossEntropyWithLogitsGradFunctor { + BinaryCrossEntropyWithLogitsGradFunctor f; + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val, + T pos_weight_val) const { + return f(input_val, target_val, dy_val, pos_weight_val) * weight_val; + } +}; + +template +class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyWithLogitsKernel() = default; + ~BinaryCrossEntropyWithLogitsKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* out = out_blob->mut_dptr(); + + if (ctx->Attr("has_pos_weight")) { + T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); + const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); + + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); + pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); + NdarrayUtil::BroadcastMul( + ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), + XpuVarNdarray(pos_weight_shape, pos_weight), + XpuVarNdarray(target_blob->shape_view(), target)); + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyWithLogitsFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed, + ctx->stream()->As()->cuda_stream()))); + + } else { + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, + target, pos_weight_processed, ctx->stream()->As()->cuda_stream()))); + } + } else { + if 
(ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, + target, weight, ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Binary( + BinaryCrossEntropyWithLogitsFunctor(), elem_cnt, out, input, + target, ctx->stream()->As()->cuda_stream()))); + } + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyWithLogitsGradKernel() = default; + ~BinaryCrossEntropyWithLogitsGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int64_t elem_cnt = input_blob->shape_view().elem_cnt(); + + const T* dy = dy_blob->dptr(); + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* dx = dx_blob->mut_dptr(); + + if (ctx->Attr("has_pos_weight")) { + T* pos_weight_processed = tmp_buffer_blob->mut_dptr(); + const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr(); + + Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes()); + pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1, + ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt()); + NdarrayUtil::BroadcastMul( + ctx->stream(), XpuVarNdarray(target_blob->shape_view(), pos_weight_processed), + XpuVarNdarray(pos_weight_shape, pos_weight), + XpuVarNdarray(target_blob->shape_view(), target)); + + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed, + ctx->stream()->As()->cuda_stream()))); + + } else { + using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed, + ctx->stream()->As()->cuda_stream()))); + } + } else { + if (ctx->has_input("weight", 0)) { + const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr(); + using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor; + using FactoryT = cuda::elementwise::SimpleFactory; + OF_CUDA_CHECK((cuda::elementwise::GenericLauncher::Launch( + FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Ternary( + BinaryCrossEntropyWithLogitsGradFunctor(), elem_cnt, dx, input, + target, dy, ctx->stream()->As()->cuda_stream()))); + } + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +user_op::InferTmpSizeFn GenFwInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + const int64_t n = ctx->InputShape("input", 0).elem_cnt(); + size_t 
tmp_buffer_size = 0; + if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } + return tmp_buffer_size; + }; +} +template +user_op::InferTmpSizeFn GenBwInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + const int64_t n = ctx->InputShape("target", 0).elem_cnt(); + size_t tmp_buffer_size = 0; + if (ctx->Attr("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); } + return tmp_buffer_size; + }; +} + +} // namespace + +#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(GenFwInferTmpSizeFn()); + +#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(GenBwInferTmpSizeFn()); + +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double) + +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp b/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp index 2de0821..79ca507 100644 --- a/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp +++ b/oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp @@ -1,277 +1,277 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include -#include "oneflow/core/kernel/cuda_graph_support.h" - -namespace oneflow { - -namespace user_op { - -namespace { - -constexpr int32_t kBlockSize = 1024; -constexpr int32_t kReduceLocalSumBlockSize = 1024; -constexpr int32_t kSingleBlockProcessNumThreshold = 1024; - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -template -inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, - int64_t max_blocks, int64_t waves, int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int max_active_blocks; - { - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, - block_size, dynamic_smem_size); - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); - return hipSuccess; -} - -template -__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target, - Out* out, - const int32_t local_elem_cnt, - const int32_t reduce_elem_cnt) { - ComputeType zero = static_cast(0.0); - ComputeType one = static_cast(1.0); - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - ComputeType reduce_sum = 0.0; - CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) { - const ComputeType input_val = static_cast(input[i]); - const ComputeType target_val = static_cast(target[i]); - const ComputeType max_val = -input_val < zero ? 
zero : -input_val; - const ComputeType result = - (one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val))); - reduce_sum += result; - } - - const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); - if (threadIdx.x == 0) { out[blockIdx.x] = static_cast(block_reduce_sum / reduce_elem_cnt); } -} - -template -__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) { - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - ComputeType reduce_sum = 0.0; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; } - const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); - if (threadIdx.x == 0) { out[0] = static_cast(block_reduce_sum); } -} - -template -__device__ __forceinline__ T Sigmoid(const T x) { - const T half_of_one = static_cast(0.5); - return half_of_one * tanh(half_of_one * x) + half_of_one; -} - -template<> -__device__ __forceinline__ half Sigmoid(const half x) { - return __float2half(Sigmoid(__half2float(x))); -} - -template -struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor { - OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor( - const T elem_cnt_reciprocal, const T dy) - : elem_cnt_reciprocal(elem_cnt_reciprocal), dy(dy) {} - __device__ T operator()(const T input_val, const T target_val) const { - return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal; - } - const T dy; - const T elem_cnt_reciprocal; -}; - -template -struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor { - OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor( - const int32_t elem_cnt, const T* dy_ptr) - : elem_cnt_reciprocal(1.0f / elem_cnt), dy_ptr(dy_ptr) {} - __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor operator()() const { - return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(elem_cnt_reciprocal, - *dy_ptr); - } - const T* dy_ptr; - const T elem_cnt_reciprocal; -}; - -template -class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel, - public CudaGraphSupport { - public: - BinaryCrossEntropyWithLogitsMeanKernel() = default; - ~BinaryCrossEntropyWithLogitsMeanKernel() override = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateBCEWithLogitsReduceMeanKernelCache(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - - int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); - int64_t reduce_elem_cnt = local_elem_cnt; - - if (cache != nullptr) { - // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
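      // Sketch (not part of the original source): the fused kernel launched below accumulates the
      // numerically stable BCE-with-logits value (1 - t) * x + max(-x, 0) + log(exp(-max) + exp(-x - max)),
      // which is algebraically equal to -t * log(sigmoid(x)) - (1 - t) * log(1 - sigmoid(x)) but does
      // not overflow for large |x|. A minimal host-side reference, assuming <cmath> is available:
      //   auto stable_bce = [](double x, double t) {
      //     const double m = x < 0.0 ? -x : 0.0;  // max(-x, 0)
      //     return (1.0 - t) * x + m + std::log(std::exp(-m) + std::exp(-x - m));
      //   };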
- const auto* bce_cache = dynamic_cast(cache); - CHECK_NOTNULL(bce_cache); - reduce_elem_cnt = bce_cache->reduce_elem_cnt(); - } - - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* out = out_blob->mut_dptr(); - using ComputeType = typename DefaultComputeType::type; - - if (local_elem_cnt <= kSingleBlockProcessNumThreshold) { - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel - <<<1, kBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( - input_blob->dptr(), target_blob->dptr(), out_blob->mut_dptr(), - local_elem_cnt, reduce_elem_cnt); - } else { - auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T); - const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize; - int launch_block = block_num; - OF_CUDA_CHECK(GetNumBlocks( - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, - kBlockSize, 0, block_num, 32, &launch_block)); - launch_block = std::min(tmp_buffer_elem_cnt, launch_block); - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel - <<stream()->As()->cuda_stream()>>>( - input_blob->dptr(), target_blob->dptr(), tmp_buffer->mut_dptr(), - local_elem_cnt, reduce_elem_cnt); - ReduceLocalSumKernel - <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( - tmp_buffer->mut_dptr(), out_blob->mut_dptr(), block_num); - } - } -}; - -template -class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel { - public: - BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; - ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateBCEWithLogitsReduceMeanKernelCache(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const override { - const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); - const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - - int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); - int64_t reduce_elem_cnt = local_elem_cnt; - if (cache != nullptr) { - // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
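      // Sketch (not part of the original source): for the mean-reduced loss, the backward pass below
      // computes dx = (sigmoid(x) - t) * dy / reduce_elem_cnt per element, where sigmoid is evaluated
      // as 0.5 * tanh(0.5 * x) + 0.5, algebraically identical to 1 / (1 + exp(-x)).
      // A single-element host-side reference, assuming <cmath> is available:
      //   auto bce_mean_grad = [](double x, double t, double dy, double reduce_elem_cnt) {
      //     const double sig = 0.5 * std::tanh(0.5 * x) + 0.5;
      //     return (sig - t) * dy / reduce_elem_cnt;
      //   };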
- const auto* bce_cache = dynamic_cast(cache); - CHECK_NOTNULL(bce_cache); - reduce_elem_cnt = bce_cache->reduce_elem_cnt(); - } - - const T* dy = dy_blob->dptr(); - const T* input = input_blob->dptr(); - const T* target = target_blob->dptr(); - T* dx = dx_blob->mut_dptr(); - using ComputeType = typename DefaultComputeType::type; - - OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory( - BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(reduce_elem_cnt, dy), - local_elem_cnt, dx, input, target, ctx->stream()->As()->cuda_stream()))); - } -}; - -} // namespace - -#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt(); \ - const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize; \ - int launch_block = block_num; \ - using ComputeType = typename DefaultComputeType::type; \ - OF_CUDA_CHECK(GetNumBlocks( \ - FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, \ - kBlockSize, 0, block_num, 32, &launch_block)); \ - const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value) \ - && (user_op::HobDataType("target", 0) == GetDataType::value) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double) - -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float) -REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double) - -} // namespace user_op +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include +#include "oneflow/core/kernel/cuda_graph_support.h" + +namespace oneflow { + +namespace user_op { + +namespace { + +constexpr int32_t kBlockSize = 1024; +constexpr int32_t kReduceLocalSumBlockSize = 1024; +constexpr int32_t kSingleBlockProcessNumThreshold = 1024; + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +template +inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size, + int64_t max_blocks, int64_t waves, int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int max_active_blocks; + { + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func, + block_size, dynamic_smem_size); + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * max_active_blocks * waves)); + return hipSuccess; +} + +template +__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target, + Out* out, + const int32_t local_elem_cnt, + const int32_t reduce_elem_cnt) { + ComputeType zero = static_cast(0.0); + ComputeType one = static_cast(1.0); + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + ComputeType reduce_sum = 0.0; + CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) { + const ComputeType input_val = static_cast(input[i]); + const ComputeType target_val = static_cast(target[i]); + const ComputeType max_val = -input_val < zero ? 
zero : -input_val; + const ComputeType result = + (one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val))); + reduce_sum += result; + } + + const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); + if (threadIdx.x == 0) { out[blockIdx.x] = static_cast(block_reduce_sum / reduce_elem_cnt); } +} + +template +__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) { + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + ComputeType reduce_sum = 0.0; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; } + const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum); + if (threadIdx.x == 0) { out[0] = static_cast(block_reduce_sum); } +} + +template +__device__ __forceinline__ T Sigmoid(const T x) { + const T half_of_one = static_cast(0.5); + return half_of_one * tanh(half_of_one * x) + half_of_one; +} + +template<> +__device__ __forceinline__ half Sigmoid(const half x) { + return __float2half(Sigmoid(__half2float(x))); +} + +template +struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor { + OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor( + const T elem_cnt_reciprocal, const T dy) + : elem_cnt_reciprocal(elem_cnt_reciprocal), dy(dy) {} + __device__ T operator()(const T input_val, const T target_val) const { + return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal; + } + const T dy; + const T elem_cnt_reciprocal; +}; + +template +struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor { + OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor( + const int32_t elem_cnt, const T* dy_ptr) + : elem_cnt_reciprocal(1.0f / elem_cnt), dy_ptr(dy_ptr) {} + __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor operator()() const { + return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(elem_cnt_reciprocal, + *dy_ptr); + } + const T* dy_ptr; + const T elem_cnt_reciprocal; +}; + +template +class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel, + public CudaGraphSupport { + public: + BinaryCrossEntropyWithLogitsMeanKernel() = default; + ~BinaryCrossEntropyWithLogitsMeanKernel() override = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateBCEWithLogitsReduceMeanKernelCache(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + + int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); + int64_t reduce_elem_cnt = local_elem_cnt; + + if (cache != nullptr) { + // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
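      // Sketch (not part of the original source): the launch below takes one of two paths. When
      // local_elem_cnt fits in a single block (<= kSingleBlockProcessNumThreshold), one block
      // computes the mean directly into out. Otherwise, stage one writes one partial sum per block
      // into tmp_buffer (each already divided by reduce_elem_cnt), with the grid size capped both
      // by GetNumBlocks (occupancy * SM count * waves) and by the tmp_buffer capacity reserved in
      // the registration's InferTmpSizeFn; stage two (ReduceLocalSumKernel) then adds the partial
      // sums on a single block to produce the final scalar.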
+ const auto* bce_cache = dynamic_cast(cache); + CHECK_NOTNULL(bce_cache); + reduce_elem_cnt = bce_cache->reduce_elem_cnt(); + } + + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* out = out_blob->mut_dptr(); + using ComputeType = typename DefaultComputeType::type; + + if (local_elem_cnt <= kSingleBlockProcessNumThreshold) { + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel + <<<1, kBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( + input_blob->dptr(), target_blob->dptr(), out_blob->mut_dptr(), + local_elem_cnt, reduce_elem_cnt); + } else { + auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T); + const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize; + int launch_block = block_num; + OF_CUDA_CHECK(GetNumBlocks( + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, + kBlockSize, 0, block_num, 32, &launch_block)); + launch_block = std::min(tmp_buffer_elem_cnt, launch_block); + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel + <<stream()->As()->cuda_stream()>>>( + input_blob->dptr(), target_blob->dptr(), tmp_buffer->mut_dptr(), + local_elem_cnt, reduce_elem_cnt); + ReduceLocalSumKernel + <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As()->cuda_stream()>>>( + tmp_buffer->mut_dptr(), out_blob->mut_dptr(), block_num); + } + } +}; + +template +class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel { + public: + BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; + ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateBCEWithLogitsReduceMeanKernelCache(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0); + const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + + int64_t local_elem_cnt = input_blob->shape_view().elem_cnt(); + int64_t reduce_elem_cnt = local_elem_cnt; + if (cache != nullptr) { + // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor. 
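      // Sketch (not part of the original source; numbers are illustrative only): when the input is
      // split over several ranks, local_elem_cnt is the per-rank element count while reduce_elem_cnt
      // taken from the cache is the global count, so every rank scales its gradient by the global
      // mean factor. For example, with 4 ranks each holding 1024 elements, local_elem_cnt == 1024
      // and reduce_elem_cnt == 4096, and each rank multiplies (sigmoid(x) - t) * dy by 1 / 4096,
      // which matches the gradient of a globally reduced mean.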
+ const auto* bce_cache = dynamic_cast(cache); + CHECK_NOTNULL(bce_cache); + reduce_elem_cnt = bce_cache->reduce_elem_cnt(); + } + + const T* dy = dy_blob->dptr(); + const T* input = input_blob->dptr(); + const T* target = target_blob->dptr(); + T* dx = dx_blob->mut_dptr(); + using ComputeType = typename DefaultComputeType::type; + + OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory( + BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(reduce_elem_cnt, dy), + local_elem_cnt, dx, input, target, ctx->stream()->As()->cuda_stream()))); + } +}; + +} // namespace + +#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt(); \ + const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize; \ + int launch_block = block_num; \ + using ComputeType = typename DefaultComputeType::type; \ + OF_CUDA_CHECK(GetNumBlocks( \ + FusedBinaryCrossEntropyWithLogitsReduceMeanKernel, \ + kBlockSize, 0, block_num, 32, &launch_block)); \ + const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value) \ + && (user_op::HobDataType("target", 0) == GetDataType::value) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double) + +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float) +REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp b/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp index 562f69c..2cf6e90 100644 --- a/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp +++ b/oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp @@ -1,88 +1,88 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { -template -__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) { - CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); } -} -template<> -__global__ void ComputeLogGpu(const int64_t len, float16* out, const float16* in) { - const half* _in = reinterpret_cast(in); - half* _out = reinterpret_cast(out); - CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); } -} - -template -class BroadcastPowYGradKernel final : public user_op::OpKernel { - public: - BroadcastPowYGradKernel() = default; - ~BroadcastPowYGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0); - const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - - const int64_t num_axes = dz_tensor->shape_view().NumAxes(); - const int64_t elem_cnt = z_tensor->shape_view().elem_cnt(); - Memset(ctx->stream(), tmp_buffer->mut_dptr(), 0, - GetCudaAlignedSize(elem_cnt * sizeof(T))); - XpuVarNdarray z(z_tensor->shape_view(), z_tensor->dptr(), num_axes); - XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); - XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); - XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); - XpuVarNdarray x(x_tensor->shape_view(), x_tensor->dptr(), num_axes); - XpuVarNdarray dy(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes); - NdarrayUtil::BroadcastAdd(ctx->stream(), tmp, x, const_tmp); - ComputeLogGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, tmp_buffer->mut_dptr(), tmp_buffer->dptr()); - NdarrayUtil::BroadcastMul(ctx->stream(), tmp, dz, const_tmp); - NdarrayUtil::BroadcastMul(ctx->stream(), tmp, z, const_tmp); - NdarrayUtil::ReduceSum(ctx->stream(), dy, const_tmp, tmp); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace -#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair) \ - REGISTER_USER_KERNEL("broadcast_pow_y_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ - .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ - const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0); \ - const DataType& data_type = z.data_type(); \ - const int64_t elem_cnt = z.shape().elem_cnt(); \ - return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type)); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { +template +__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) { + CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); } +} +template<> +__global__ void ComputeLogGpu(const int64_t len, float16* out, const float16* in) { + const half* _in = reinterpret_cast(in); + half* _out = reinterpret_cast(out); + CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); } +} + +template +class BroadcastPowYGradKernel final : public user_op::OpKernel { + public: + BroadcastPowYGradKernel() = default; + ~BroadcastPowYGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0); + const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + + const int64_t num_axes = dz_tensor->shape_view().NumAxes(); + const int64_t elem_cnt = z_tensor->shape_view().elem_cnt(); + Memset(ctx->stream(), tmp_buffer->mut_dptr(), 0, + GetCudaAlignedSize(elem_cnt * sizeof(T))); + XpuVarNdarray z(z_tensor->shape_view(), z_tensor->dptr(), num_axes); + XpuVarNdarray dz(dz_tensor->shape_view(), dz_tensor->dptr(), num_axes); + XpuVarNdarray const_tmp(dz.shape(), tmp_buffer->dptr()); + XpuVarNdarray tmp(dz.shape(), tmp_buffer->mut_dptr()); + XpuVarNdarray x(x_tensor->shape_view(), x_tensor->dptr(), num_axes); + XpuVarNdarray dy(dy_tensor->shape_view(), dy_tensor->mut_dptr(), num_axes); + NdarrayUtil::BroadcastAdd(ctx->stream(), tmp, x, const_tmp); + ComputeLogGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, tmp_buffer->mut_dptr(), tmp_buffer->dptr()); + NdarrayUtil::BroadcastMul(ctx->stream(), tmp, dz, const_tmp); + NdarrayUtil::BroadcastMul(ctx->stream(), tmp, z, const_tmp); + NdarrayUtil::ReduceSum(ctx->stream(), dy, const_tmp, tmp); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace +#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair) \ + REGISTER_USER_KERNEL("broadcast_pow_y_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ + .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ + const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0); \ + const DataType& data_type = z.data_type(); \ + const int64_t elem_cnt = z.shape().elem_cnt(); \ + return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type)); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, 
(DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp b/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp index 927b0c8..3dde691 100644 --- a/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp +++ b/oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp @@ -1,125 +1,125 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef NDEBUG -#undef NDEBUG -#endif -#include -#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -using CuInt64T = unsigned long long int; - -__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) { - return atomicCAS(address, compare, val); -} - -__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) { - static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); - return static_cast(atomicCAS(reinterpret_cast(address), - static_cast(compare), - static_cast(val))); -} - -__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) { - return atomicAdd(address, val); -} - -__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { - static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); - return static_cast( - atomicAdd(reinterpret_cast(address), static_cast(val))); -} - -template -__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) { - K old_key = AtomicCAS(key, static_cast(0), hash); - if (old_key == 0) { - V v = AtomicAdd(size, 1) + 1; - *value = v; - *out = v; - return true; - } else if (old_key == hash) { - while (true) { - V v = *value; - if (v != 0) { - *out = v; - break; - } - } - return true; - } else { - return false; - } -} - -template -__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) { - if (hash == 0) { - *out = 0; - return true; - } - const size_t start_idx = static_cast(hash) % capacity; - // fast path - { - T* key = table + start_idx * 2; - T* value = key + 1; - if (*key == hash && *value != 0) { - *out = *value; - return true; - } - } - for (size_t count = 0; count < capacity; ++count) { - const size_t idx = (start_idx + count) % capacity; - T* key = table + idx * 2; - T* value = key + 1; - if (TryGetOrInsert(key, value, size, hash, out)) { return true; } - } - return false; -} - -template -__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash, - T* out) { - CUDA_1D_KERNEL_LOOP(i, n) { - bool success = GetOrInsertOne(capacity, table, size, hash[i], out + i); - assert(success); - } -} - -} // namespace - -template -struct CategoricalOrdinalEncodeKernelUtil { - static void Encode(ep::Stream* stream, 
int64_t capacity, T* table, T* size, int64_t n, - const T* hash, T* out) { - EncodeGpu - <<As()->cuda_stream()>>>(capacity, table, size, n, hash, out); - } -}; - -#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct CategoricalOrdinalEncodeKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA - +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef NDEBUG +#undef NDEBUG +#endif +#include +#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +using CuInt64T = unsigned long long int; + +__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) { + return atomicCAS(address, compare, val); +} + +__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) { + static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); + return static_cast(atomicCAS(reinterpret_cast(address), + static_cast(compare), + static_cast(val))); +} + +__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) { + return atomicAdd(address, val); +} + +__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { + static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); + return static_cast( + atomicAdd(reinterpret_cast(address), static_cast(val))); +} + +template +__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) { + K old_key = AtomicCAS(key, static_cast(0), hash); + if (old_key == 0) { + V v = AtomicAdd(size, 1) + 1; + *value = v; + *out = v; + return true; + } else if (old_key == hash) { + while (true) { + V v = *value; + if (v != 0) { + *out = v; + break; + } + } + return true; + } else { + return false; + } +} + +template +__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) { + if (hash == 0) { + *out = 0; + return true; + } + const size_t start_idx = static_cast(hash) % capacity; + // fast path + { + T* key = table + start_idx * 2; + T* value = key + 1; + if (*key == hash && *value != 0) { + *out = *value; + return true; + } + } + for (size_t count = 0; count < capacity; ++count) { + const size_t idx = (start_idx + count) % capacity; + T* key = table + idx * 2; + T* value = key + 1; + if (TryGetOrInsert(key, value, size, hash, out)) { return true; } + } + return false; +} + +template +__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash, + T* out) { + CUDA_1D_KERNEL_LOOP(i, n) { + bool success = GetOrInsertOne(capacity, table, size, hash[i], out + i); + assert(success); + } +} + +} // namespace + +template +struct CategoricalOrdinalEncodeKernelUtil { + static void Encode(ep::Stream* stream, 
int64_t capacity, T* table, T* size, int64_t n, + const T* hash, T* out) { + EncodeGpu + <<As()->cuda_stream()>>>(capacity, table, size, n, hash, out); + } +}; + +#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct CategoricalOrdinalEncodeKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/clip_by_value_kernel.hip.cpp b/oneflow/user/kernels/clip_by_value_kernel.hip.cpp index eb2e550..7039aae 100644 --- a/oneflow/user/kernels/clip_by_value_kernel.hip.cpp +++ b/oneflow/user/kernels/clip_by_value_kernel.hip.cpp @@ -1,72 +1,72 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/clip_by_value_kernel.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void CudaClipForward(F clip_func, int64_t n, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = clip_func(x[i]); } -} - -template -__global__ void CudaClipBackward(F clip_func, int64_t n, const T* x, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP(i, n) { dx[i] = clip_func(x[i], dy[i]); } -} - -} // namespace - -template -struct ClipKernelUtil { - template - static void Forward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, T* y) { - if (n == 0) { return; } - RUN_CUDA_KERNEL((CudaClipForward), stream, n, clip_func, n, x, y); - } - - template - static void Backward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, const T* dy, - T* dx) { - if (n == 0) { return; } - RUN_CUDA_KERNEL((CudaClipBackward), stream, n, clip_func, n, x, dy, dx); - } -}; - -#define INITIATE_CLIP_KERNEL_UTIL_CUDA(dtype, dtype_v) \ - template struct ClipKernelUtil; \ - template void ClipKernelUtil::Forward( \ - ep::Stream*, ClipByMinFunctor, const int64_t n, const dtype*, dtype*); \ - template void ClipKernelUtil::Forward( \ - ep::Stream*, ClipByMaxFunctor, const int64_t n, const dtype*, dtype*); \ - template void ClipKernelUtil::Forward( \ - ep::Stream*, ClipByMinMaxFunctor, const int64_t n, const dtype*, dtype*); \ - template void ClipKernelUtil::Backward( \ - ep::Stream*, ClipByMinGradFunctor, const int64_t n, const dtype*, const dtype*, \ - dtype*); \ - template void ClipKernelUtil::Backward( \ - ep::Stream*, ClipByMaxGradFunctor, const int64_t n, const dtype*, const dtype*, \ - dtype*); \ - template void ClipKernelUtil::Backward( \ - ep::Stream*, ClipByMinMaxGradFunctor, const int64_t n, const dtype*, const dtype*, \ - dtype*); - -OF_PP_FOR_EACH_TUPLE(INITIATE_CLIP_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "hip/hip_runtime.h"
+#include "oneflow/user/kernels/clip_by_value_kernel.h"
+#include "oneflow/core/device/cuda_util.h"
+
+namespace oneflow {
+
+namespace {
+
+template<typename T, typename F>
+__global__ void CudaClipForward(F clip_func, int64_t n, const T* x, T* y) {
+  CUDA_1D_KERNEL_LOOP(i, n) { y[i] = clip_func(x[i]); }
+}
+
+template<typename T, typename F>
+__global__ void CudaClipBackward(F clip_func, int64_t n, const T* x, const T* dy, T* dx) {
+  CUDA_1D_KERNEL_LOOP(i, n) { dx[i] = clip_func(x[i], dy[i]); }
+}
+
+}  // namespace
+
+template<typename T>
+struct ClipKernelUtil<DeviceType::kCUDA, T> {
+  template<typename F>
+  static void Forward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, T* y) {
+    if (n == 0) { return; }
+    RUN_CUDA_KERNEL((CudaClipForward<T, F>), stream, n, clip_func, n, x, y);
+  }
+
+  template<typename F>
+  static void Backward(ep::Stream* stream, F clip_func, const int64_t n, const T* x, const T* dy,
+                       T* dx) {
+    if (n == 0) { return; }
+    RUN_CUDA_KERNEL((CudaClipBackward<T, F>), stream, n, clip_func, n, x, dy, dx);
+  }
+};
+
+#define INITIATE_CLIP_KERNEL_UTIL_CUDA(dtype, dtype_v)                                          \
+  template struct ClipKernelUtil<DeviceType::kCUDA, dtype>;                                     \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Forward(                              \
+      ep::Stream*, ClipByMinFunctor<dtype>, const int64_t n, const dtype*, dtype*);             \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Forward(                              \
+      ep::Stream*, ClipByMaxFunctor<dtype>, const int64_t n, const dtype*, dtype*);             \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Forward(                              \
+      ep::Stream*, ClipByMinMaxFunctor<dtype>, const int64_t n, const dtype*, dtype*);          \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Backward(                             \
+      ep::Stream*, ClipByMinGradFunctor<dtype>, const int64_t n, const dtype*, const dtype*,    \
+      dtype*);                                                                                  \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Backward(                             \
+      ep::Stream*, ClipByMaxGradFunctor<dtype>, const int64_t n, const dtype*, const dtype*,    \
+      dtype*);                                                                                  \
+  template void ClipKernelUtil<DeviceType::kCUDA, dtype>::Backward(                             \
+      ep::Stream*, ClipByMinMaxGradFunctor<dtype>, const int64_t n, const dtype*, const dtype*, \
+      dtype*);
+
+OF_PP_FOR_EACH_TUPLE(INITIATE_CLIP_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ)
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp b/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp
index 871ae34..446fd45 100644
--- a/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp
+++ b/oneflow/user/kernels/combined_margin_loss_kernel.hip.cpp
@@ -1,225 +1,225 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/balanced_splitter.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/user/kernels/math_unary_elementwise_func.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void GpuForward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, - const T m1, const T m2, const T m3, const T* in, const K* labels, T* out, - T* theta) { - CUDA_1D_KERNEL_LOOP(i, n) { - const int32_t row_id = i / num_classes; - const int32_t col_id = i - row_id * num_classes; - const T in_data = in[i]; - T out_data = in_data; - K label = labels[row_id] - lower_bound; - if (is_cosine_loss) { - if (label == col_id) { out_data = in_data - m3; } - } else { - if (label == col_id) { - const T theta_data = AcosFunctor::Forward(in_data); - out_data = CosFunctor::Forward(theta_data * m1 + m2) - m3; - theta[row_id] = theta_data; - } else if ((label < 0 || label >= num_classes) && col_id == 0) { - theta[row_id] = 0; - } - } - out[i] = out_data; - } -} - -template -__global__ void GpuBackward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, - const T m1, const T m2, const T m3, const T* dy, const K* labels, - const T* theta, T* dx) { - CUDA_1D_KERNEL_LOOP(i, n) { - const int32_t row_id = i / num_classes; - const int32_t col_id = i - row_id * num_classes; - K label = labels[row_id] - lower_bound; - const T dy_data = dy[i]; - const T theta_data = theta[row_id]; - T dx_data = dy_data; - if (label == col_id && !is_cosine_loss) { - dx_data = dy_data * SinFunctor::Forward(theta_data * m1 + m2) * m1 - / SinFunctor::Forward(theta_data); - } - dx[i] = dx_data; - } -} - -class CombinedMarginLossOpKernelCache final : public user_op::OpKernelCache { - public: - CombinedMarginLossOpKernelCache(int64_t lower, int64_t upper) : lower_(lower), upper_(upper) {} - ~CombinedMarginLossOpKernelCache() override = default; - - int64_t lower() const { return lower_; } - int64_t upper() const { return upper_; } - - private: - const int64_t lower_; - const int64_t upper_; -}; - -std::shared_ptr CreateCombinedMarginLossOpKernelCache( - user_op::KernelCacheContext* ctx, const std::string& in_arg_name) { - if (ctx->parallel_ctx().parallel_num() == 1) { return nullptr; } - - const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex(in_arg_name, 0); - if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 1 - && ctx->parallel_ctx().parallel_num() > 1) { - CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); - const user_op::TensorDesc* in_logical_desc = - ctx->LogicalTensorDesc4ArgNameAndIndex(in_arg_name, 0); - const auto depth = ctx->Attr("depth"); - CHECK_EQ(depth, in_logical_desc->shape().At(1)); - BalancedSplitter bs(depth, ctx->parallel_ctx().parallel_num()); - return std::make_shared( - bs.At(ctx->parallel_ctx().parallel_id()).begin(), - bs.At(ctx->parallel_ctx().parallel_id()).end()); - } else { - return nullptr; - } -} - -} // namespace - -template -class CombinedMarginLossGpuKernel final : public user_op::OpKernel { - public: - CombinedMarginLossGpuKernel() = default; - ~CombinedMarginLossGpuKernel() override = default; - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateCombinedMarginLossOpKernelCache(ctx, "x"); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, 
- const user_op::OpKernelCache* cache) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); - const float m1 = ctx->Attr("m1"); - const float m2 = ctx->Attr("m2"); - const float m3 = ctx->Attr("m3"); - int64_t lower_bound = 0; - if (cache != nullptr) { - auto* kernel_cache = dynamic_cast(cache); - CHECK_NOTNULL(kernel_cache); - CHECK_EQ(x->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); - lower_bound = kernel_cache->lower(); - } - if (m1 == 1.0 && m2 == 0.0) { - GpuForward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), - static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), - y->mut_dptr(), theta->mut_dptr()); - } else { - GpuForward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), - static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), - y->mut_dptr(), theta->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL(in_type, indices_type) \ - REGISTER_USER_KERNEL("combined_margin_loss") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL, FLOATING_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - -template -class CombinedMarginLossGradGpuKernel final : public user_op::OpKernel { - public: - CombinedMarginLossGradGpuKernel() = default; - ~CombinedMarginLossGradGpuKernel() override = default; - - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - return CreateCombinedMarginLossOpKernelCache(ctx, "dy"); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - const user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float m1 = ctx->Attr("m1"); - const float m2 = ctx->Attr("m2"); - const float m3 = ctx->Attr("m3"); - int64_t lower_bound = 0; - if (cache != nullptr) { - auto* kernel_cache = dynamic_cast(cache); - CHECK_NOTNULL(kernel_cache); - CHECK_EQ(dy->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); - lower_bound = kernel_cache->lower(); - } - if (m1 == 1.0 && m2 == 0.0) { - GpuBackward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, - static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), - label->dptr(), theta->dptr(), dx->mut_dptr()); - } else { - GpuBackward - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - 
dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, - static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), - label->dptr(), theta->dptr(), dx->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL(dy_type, indices_type) \ - REGISTER_USER_KERNEL("combined_margin_loss_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == OF_PP_PAIR_SECOND(dy_type)) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/balanced_splitter.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/user/kernels/math_unary_elementwise_func.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void GpuForward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, + const T m1, const T m2, const T m3, const T* in, const K* labels, T* out, + T* theta) { + CUDA_1D_KERNEL_LOOP(i, n) { + const int32_t row_id = i / num_classes; + const int32_t col_id = i - row_id * num_classes; + const T in_data = in[i]; + T out_data = in_data; + K label = labels[row_id] - lower_bound; + if (is_cosine_loss) { + if (label == col_id) { out_data = in_data - m3; } + } else { + if (label == col_id) { + const T theta_data = AcosFunctor::Forward(in_data); + out_data = CosFunctor::Forward(theta_data * m1 + m2) - m3; + theta[row_id] = theta_data; + } else if ((label < 0 || label >= num_classes) && col_id == 0) { + theta[row_id] = 0; + } + } + out[i] = out_data; + } +} + +template +__global__ void GpuBackward(const int64_t n, const int64_t num_classes, const int64_t lower_bound, + const T m1, const T m2, const T m3, const T* dy, const K* labels, + const T* theta, T* dx) { + CUDA_1D_KERNEL_LOOP(i, n) { + const int32_t row_id = i / num_classes; + const int32_t col_id = i - row_id * num_classes; + K label = labels[row_id] - lower_bound; + const T dy_data = dy[i]; + const T theta_data = theta[row_id]; + T dx_data = dy_data; + if (label == col_id && !is_cosine_loss) { + dx_data = dy_data * SinFunctor::Forward(theta_data * m1 + m2) * m1 + / SinFunctor::Forward(theta_data); + } + dx[i] = dx_data; + } +} + +class CombinedMarginLossOpKernelCache final : public user_op::OpKernelCache { + public: + CombinedMarginLossOpKernelCache(int64_t lower, int64_t upper) : lower_(lower), upper_(upper) {} + ~CombinedMarginLossOpKernelCache() override = default; + + int64_t lower() const { return lower_; } + int64_t upper() const { return upper_; } + + private: + const int64_t lower_; + const int64_t upper_; +}; + +std::shared_ptr 
CreateCombinedMarginLossOpKernelCache( + user_op::KernelCacheContext* ctx, const std::string& in_arg_name) { + if (ctx->parallel_ctx().parallel_num() == 1) { return nullptr; } + + const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex(in_arg_name, 0); + if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 1 + && ctx->parallel_ctx().parallel_num() > 1) { + CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); + const user_op::TensorDesc* in_logical_desc = + ctx->LogicalTensorDesc4ArgNameAndIndex(in_arg_name, 0); + const auto depth = ctx->Attr("depth"); + CHECK_EQ(depth, in_logical_desc->shape().At(1)); + BalancedSplitter bs(depth, ctx->parallel_ctx().parallel_num()); + return std::make_shared( + bs.At(ctx->parallel_ctx().parallel_id()).begin(), + bs.At(ctx->parallel_ctx().parallel_id()).end()); + } else { + return nullptr; + } +} + +} // namespace + +template +class CombinedMarginLossGpuKernel final : public user_op::OpKernel { + public: + CombinedMarginLossGpuKernel() = default; + ~CombinedMarginLossGpuKernel() override = default; + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + return CreateCombinedMarginLossOpKernelCache(ctx, "x"); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); + const float m1 = ctx->Attr("m1"); + const float m2 = ctx->Attr("m2"); + const float m3 = ctx->Attr("m3"); + int64_t lower_bound = 0; + if (cache != nullptr) { + auto* kernel_cache = dynamic_cast(cache); + CHECK_NOTNULL(kernel_cache); + CHECK_EQ(x->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); + lower_bound = kernel_cache->lower(); + } + if (m1 == 1.0 && m2 == 0.0) { + GpuForward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), + static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), + y->mut_dptr(), theta->mut_dptr()); + } else { + GpuForward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + x->shape_view().elem_cnt(), x->shape_view().Count(1), lower_bound, static_cast(m1), + static_cast(m2), static_cast(m3), x->dptr(), label->dptr(), + y->mut_dptr(), theta->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL(in_type, indices_type) \ + REGISTER_USER_KERNEL("combined_margin_loss") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(in_type)) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_CUDA_KERNEL, FLOATING_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + +template +class CombinedMarginLossGradGpuKernel final : public user_op::OpKernel { + public: + CombinedMarginLossGradGpuKernel() = default; + ~CombinedMarginLossGradGpuKernel() override = default; + + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const 
override { + return CreateCombinedMarginLossOpKernelCache(ctx, "dy"); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + const user_op::Tensor* theta = ctx->Tensor4ArgNameAndIndex("theta", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float m1 = ctx->Attr("m1"); + const float m2 = ctx->Attr("m2"); + const float m3 = ctx->Attr("m3"); + int64_t lower_bound = 0; + if (cache != nullptr) { + auto* kernel_cache = dynamic_cast(cache); + CHECK_NOTNULL(kernel_cache); + CHECK_EQ(dy->shape_view().Count(1), kernel_cache->upper() - kernel_cache->lower()); + lower_bound = kernel_cache->lower(); + } + if (m1 == 1.0 && m2 == 0.0) { + GpuBackward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, + static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), + label->dptr(), theta->dptr(), dx->mut_dptr()); + } else { + GpuBackward + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + dy->shape_view().elem_cnt(), dy->shape_view().Count(1), lower_bound, + static_cast(m1), static_cast(m2), static_cast(m3), dy->dptr(), + label->dptr(), theta->dptr(), dx->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL(dy_type, indices_type) \ + REGISTER_USER_KERNEL("combined_margin_loss_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == OF_PP_PAIR_SECOND(dy_type)) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(indices_type))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_COMBINED_MARGIN_LOSS_GRAD_CUDA_KERNEL, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/count_not_finite_kernel.hip.cpp b/oneflow/user/kernels/count_not_finite_kernel.hip.cpp index 7275283..98a3487 100644 --- a/oneflow/user/kernels/count_not_finite_kernel.hip.cpp +++ b/oneflow/user/kernels/count_not_finite_kernel.hip.cpp @@ -1,173 +1,173 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct Param { - const T* x[N]; - int64_t x_elem_cnt[N]; - int64_t* y; - int64_t num_x; -}; - -using CuInt64T = unsigned long long int; - -__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { - static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); - return static_cast( - atomicAdd(reinterpret_cast(address), static_cast(val))); -} - -template -__inline__ __device__ bool IsFinite(T x) { - return isfinite(x); -} - -template<> -__inline__ __device__ bool IsFinite(half x) { - return IsFinite(static_cast(x)); -} - -template -__global__ void CountNotFiniteGpu(const int64_t n, const T* x, int64_t* y) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; - int64_t thread_count = 0; - CUDA_1D_KERNEL_LOOP(i, n) { - if (!IsFinite(x[i])) { thread_count += 1; } - } - __syncthreads(); - int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); - if (threadIdx.x == 0) { AtomicAdd(y, block_count_sum); } -} - -template -__global__ void MultiCountNotFiniteGpu(Param param) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; - int64_t thread_count = 0; - for (int32_t k = 0; k < param.num_x; ++k) { - CUDA_1D_KERNEL_LOOP(i, param.x_elem_cnt[k]) { - if (!IsFinite(param.x[k][i])) { thread_count += 1; } - } - } - __syncthreads(); - int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); - if (threadIdx.x == 0) { AtomicAdd(param.y, block_count_sum); } -} - -constexpr int64_t kCountNotFiniteNumBlocks = 512; - -int GetCountNotFiniteNumBlocks(const int64_t elem_cnt) { - return std::min((elem_cnt + kCudaThreadsNumPerBlock - 1) / kCudaThreadsNumPerBlock, - kCountNotFiniteNumBlocks); -} - -} // namespace - -template -class CountNotFiniteGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - CountNotFiniteGpuKernel() = default; - ~CountNotFiniteGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t elem_cnt = x->shape_view().elem_cnt(); - Memset(ctx->stream(), y->mut_dptr(), 0, - y->shape_view().elem_cnt() * sizeof(int64_t)); - CountNotFiniteGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, x->dptr(), y->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("count_not_finite") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(half) -REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(float) -REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(double) - -template -class MultiCountNotFiniteGpuKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MultiCountNotFiniteGpuKernel() = default; - ~MultiCountNotFiniteGpuKernel() override = default; - - private: - using 
user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - Param para; - Memset(ctx->stream(), y->mut_dptr(), 0, - y->shape_view().elem_cnt() * sizeof(int64_t)); - para.y = y->mut_dptr(); - - int64_t remain_size = ctx->inputs().size(); - int64_t input_id = 0; - while (remain_size > 0) { - if (remain_size > 128) { - remain_size -= 128; - para.num_x = 128; - } else { - para.num_x = remain_size; - remain_size = 0; - } - int64_t max_elem_cnt = 0; - for (int32_t i = 0; i < para.num_x; ++i) { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", input_id); - input_id++; - para.x[i] = x->dptr(); - para.x_elem_cnt[i] = x->shape_view().elem_cnt(); - max_elem_cnt = std::max(max_elem_cnt, x->shape_view().elem_cnt()); - } - MultiCountNotFiniteGpu - <<stream()->As()->cuda_stream()>>>(para); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("multi_count_not_finite") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(half) -REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(float) -REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct Param { + const T* x[N]; + int64_t x_elem_cnt[N]; + int64_t* y; + int64_t num_x; +}; + +using CuInt64T = unsigned long long int; + +__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) { + static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error"); + return static_cast( + atomicAdd(reinterpret_cast(address), static_cast(val))); +} + +template +__inline__ __device__ bool IsFinite(T x) { + return isfinite(x); +} + +template<> +__inline__ __device__ bool IsFinite(half x) { + return IsFinite(static_cast(x)); +} + +template +__global__ void CountNotFiniteGpu(const int64_t n, const T* x, int64_t* y) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; + int64_t thread_count = 0; + CUDA_1D_KERNEL_LOOP(i, n) { + if (!IsFinite(x[i])) { thread_count += 1; } + } + __syncthreads(); + int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); + if (threadIdx.x == 0) { AtomicAdd(y, block_count_sum); } +} + +template +__global__ void MultiCountNotFiniteGpu(Param param) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage cub_reduce_tmp_storage; + int64_t thread_count = 0; + for (int32_t k = 0; k < param.num_x; ++k) { + CUDA_1D_KERNEL_LOOP(i, param.x_elem_cnt[k]) { + if (!IsFinite(param.x[k][i])) { thread_count += 1; } + } + } + __syncthreads(); + int64_t block_count_sum = BlockReduce(cub_reduce_tmp_storage).Reduce(thread_count, hipcub::Sum()); + if (threadIdx.x == 0) { AtomicAdd(param.y, block_count_sum); } +} + +constexpr int64_t kCountNotFiniteNumBlocks = 512; + +int GetCountNotFiniteNumBlocks(const int64_t elem_cnt) { + return std::min((elem_cnt + kCudaThreadsNumPerBlock - 1) / kCudaThreadsNumPerBlock, + kCountNotFiniteNumBlocks); +} + +} // namespace + +template +class CountNotFiniteGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + CountNotFiniteGpuKernel() = default; + ~CountNotFiniteGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int64_t elem_cnt = x->shape_view().elem_cnt(); + Memset(ctx->stream(), y->mut_dptr(), 0, + y->shape_view().elem_cnt() * sizeof(int64_t)); + CountNotFiniteGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, x->dptr(), y->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("count_not_finite") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)); + +REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(half) +REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(float) +REGISTER_COUNT_NOT_FINITE_CUDA_KERNEL(double) + +template +class MultiCountNotFiniteGpuKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MultiCountNotFiniteGpuKernel() = default; + ~MultiCountNotFiniteGpuKernel() override = default; + + private: + using 
user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + Param para; + Memset(ctx->stream(), y->mut_dptr(), 0, + y->shape_view().elem_cnt() * sizeof(int64_t)); + para.y = y->mut_dptr(); + + int64_t remain_size = ctx->inputs().size(); + int64_t input_id = 0; + while (remain_size > 0) { + if (remain_size > 128) { + remain_size -= 128; + para.num_x = 128; + } else { + para.num_x = remain_size; + remain_size = 0; + } + int64_t max_elem_cnt = 0; + for (int32_t i = 0; i < para.num_x; ++i) { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", input_id); + input_id++; + para.x[i] = x->dptr(); + para.x_elem_cnt[i] = x->shape_view().elem_cnt(); + max_elem_cnt = std::max(max_elem_cnt, x->shape_view().elem_cnt()); + } + MultiCountNotFiniteGpu + <<stream()->As()->cuda_stream()>>>(para); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("multi_count_not_finite") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)); + +REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(half) +REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(float) +REGISTER_MULTI_COUNT_NOT_FINITE_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp b/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp index e754485..67568dd 100644 --- a/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp +++ b/oneflow/user/kernels/ctc_greedy_decoder.hip.cpp @@ -1,146 +1,146 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/user/kernels/ctc_greedy_decoder.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { - -template -__global__ void CtcGreedyDecodeGpuMultiThread(int64_t* decoded_ptr, T* neg_sum_logits_ptr, - const T* log_probs_ptr, - const int64_t* input_lengths_ptr, - const bool merge_repeated, - const int64_t max_input_length, - const int64_t batch_size, const int64_t num_labels) { - const int64_t bid = blockIdx.x; - const int64_t tid = threadIdx.x; - - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - if (tid == 0) { - if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} - } - } - - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - extern __shared__ int64_t shared_max_indices_memory[]; - int64_t* shared_max_indices = (int64_t*)shared_max_indices_memory; - NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); - for (int64_t t = tid; t < max_input_length; t += blockDim.x) { - const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; - int64_t max_indice = 0; - T max_value = -FLT_MAX; - FOR_RANGE(int64_t, c, 0, num_labels) { - const T prob = prob_data_t[c]; - if (prob > max_value) { - max_indice = c; - max_value = prob; - } - } - shared_max_indices[t] = max_indice; - } - - __syncthreads(); - - if (tid == 0) { - int64_t prev_indices = -1, t_dec = 0; - FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { - const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; - const int64_t indice_t = shared_max_indices[t]; - neg_sum_logits_ptr[b] -= prob_data_t[indice_t]; - if (indice_t != num_labels - 1 && !(merge_repeated && (prev_indices == indice_t))) { - decoded_ptr[b * max_input_length + t_dec] = indice_t; - t_dec++; - } - prev_indices = indice_t; - } - FOR_RANGE(int64_t, t, t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } - } - } -} - -template -__global__ void CtcGreedyDecodeGpu(int64_t* decoded_ptr, T* neg_sum_logits_ptr, - const T* log_probs_ptr, const int64_t* input_lengths_ptr, - const bool merge_repeated, const int64_t max_input_length, - const int64_t batch_size, const int64_t num_labels) { - for (int64_t b = 0; b < batch_size; b++) { - if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} - } - NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); - - CUDA_1D_KERNEL_LOOP(b, batch_size) { - int prev_indices = -1, t_dec = 0; - neg_sum_logits_ptr[b] = 0; - FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { - const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; - int64_t max_indice = -1; - T max_value = -FLT_MAX; - FOR_RANGE(int64_t, c, 0, num_labels) { - if (prob_data_t[c] > max_value) { - max_indice = c; - max_value = prob_data_t[c]; - } - } - neg_sum_logits_ptr[b] -= max_value; - if (max_indice != num_labels - 1 && !(merge_repeated && (prev_indices == max_indice))) { - decoded_ptr[b * max_input_length + t_dec] = max_indice; - t_dec++; - } - prev_indices = max_indice; - } - FOR_RANGE(int64_t, t, t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } - } -} - -template -struct CTCGreedyDecoderFunctor final { - void operator()(ep::Stream* stream, int64_t* decoded_ptr, T* neg_sum_logits_ptr, - const T* log_probs_ptr, const int64_t* input_lengths_ptr, - const bool merge_repeated, const 
int64_t max_input_length, - const int64_t batch_size, const int64_t num_labels) { - int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; - int64_t shared_mem_size = max_input_length * sizeof(int64_t); - - int max_active_blocks; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, CtcGreedyDecodeGpu, kCudaThreadsNumPerBlock, shared_mem_size)); - if (max_active_blocks > 0) { - CtcGreedyDecodeGpuMultiThread<<As()->cuda_stream()>>>( - decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, - max_input_length, batch_size, num_labels); - - } else { - CtcGreedyDecodeGpu<<As()->cuda_stream()>>>( - decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, - max_input_length, batch_size, num_labels); - } - } -}; - -} // namespace - -REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, float); -REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/user/kernels/ctc_greedy_decoder.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { + +template +__global__ void CtcGreedyDecodeGpuMultiThread(int64_t* decoded_ptr, T* neg_sum_logits_ptr, + const T* log_probs_ptr, + const int64_t* input_lengths_ptr, + const bool merge_repeated, + const int64_t max_input_length, + const int64_t batch_size, const int64_t num_labels) { + const int64_t bid = blockIdx.x; + const int64_t tid = threadIdx.x; + + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + if (tid == 0) { + if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} + } + } + + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + extern __shared__ int64_t shared_max_indices_memory[]; + int64_t* shared_max_indices = (int64_t*)shared_max_indices_memory; + NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); + for (int64_t t = tid; t < max_input_length; t += blockDim.x) { + const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; + int64_t max_indice = 0; + T max_value = -FLT_MAX; + FOR_RANGE(int64_t, c, 0, num_labels) { + const T prob = prob_data_t[c]; + if (prob > max_value) { + max_indice = c; + max_value = prob; + } + } + shared_max_indices[t] = max_indice; + } + + __syncthreads(); + + if (tid == 0) { + int64_t prev_indices = -1, t_dec = 0; + FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { + const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; + const int64_t indice_t = shared_max_indices[t]; + neg_sum_logits_ptr[b] -= prob_data_t[indice_t]; + if (indice_t != num_labels - 1 && !(merge_repeated && (prev_indices == indice_t))) { + decoded_ptr[b * max_input_length + t_dec] = indice_t; + t_dec++; + } + prev_indices = indice_t; + } + FOR_RANGE(int64_t, t, 
t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } + } + } +} + +template +__global__ void CtcGreedyDecodeGpu(int64_t* decoded_ptr, T* neg_sum_logits_ptr, + const T* log_probs_ptr, const int64_t* input_lengths_ptr, + const bool merge_repeated, const int64_t max_input_length, + const int64_t batch_size, const int64_t num_labels) { + for (int64_t b = 0; b < batch_size; b++) { + if (input_lengths_ptr[b] > max_input_length) {asm volatile("s_trap 0;");} + } + NdIndexOffsetHelper input_helper(max_input_length, batch_size, num_labels); + + CUDA_1D_KERNEL_LOOP(b, batch_size) { + int prev_indices = -1, t_dec = 0; + neg_sum_logits_ptr[b] = 0; + FOR_RANGE(int64_t, t, 0, input_lengths_ptr[b]) { + const T* prob_data_t = &log_probs_ptr[input_helper.NdIndexToOffset(t, b, 0)]; + int64_t max_indice = -1; + T max_value = -FLT_MAX; + FOR_RANGE(int64_t, c, 0, num_labels) { + if (prob_data_t[c] > max_value) { + max_indice = c; + max_value = prob_data_t[c]; + } + } + neg_sum_logits_ptr[b] -= max_value; + if (max_indice != num_labels - 1 && !(merge_repeated && (prev_indices == max_indice))) { + decoded_ptr[b * max_input_length + t_dec] = max_indice; + t_dec++; + } + prev_indices = max_indice; + } + FOR_RANGE(int64_t, t, t_dec, max_input_length) { decoded_ptr[b * max_input_length + t] = 0; } + } +} + +template +struct CTCGreedyDecoderFunctor final { + void operator()(ep::Stream* stream, int64_t* decoded_ptr, T* neg_sum_logits_ptr, + const T* log_probs_ptr, const int64_t* input_lengths_ptr, + const bool merge_repeated, const int64_t max_input_length, + const int64_t batch_size, const int64_t num_labels) { + int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; + int64_t shared_mem_size = max_input_length * sizeof(int64_t); + + int max_active_blocks; + OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, CtcGreedyDecodeGpu, kCudaThreadsNumPerBlock, shared_mem_size)); + if (max_active_blocks > 0) { + CtcGreedyDecodeGpuMultiThread<<As()->cuda_stream()>>>( + decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, + max_input_length, batch_size, num_labels); + + } else { + CtcGreedyDecodeGpu<<As()->cuda_stream()>>>( + decoded_ptr, neg_sum_logits_ptr, log_probs_ptr, input_lengths_ptr, merge_repeated, + max_input_length, batch_size, num_labels); + } + } +}; + +} // namespace + +REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, float); +REGISTER_CTC_GREEDY_DECODER_KERNELS(DeviceType::kCUDA, double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp b/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp index abbaccf..688df77 100644 --- a/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp +++ b/oneflow/user/kernels/ctc_loss_kernel_util.hip.cpp @@ -1,285 +1,285 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/ctc_loss_kernel_util.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__device__ __inline__ static int get_target_prime(const int* targets_ptr, - const IDX* target_lengths_ptr, - int64_t max_target_length, int64_t b, int64_t s, - int blank, const int32_t targets_ndim) { - if (s % 2 == 0) { - return blank; - } else { - int64_t idx = 0; - if (targets_ndim == 1) { - FOR_RANGE(int64_t, i, 0, b) { idx += target_lengths_ptr[i]; } - } else { // targets_ndim == 2 - idx = b * max_target_length; - } - idx += s / 2; - return targets_ptr[idx]; - } -} - -template -__global__ void CtcLossGpu(const T* log_probs_ptr, const int* targets_ptr, - const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* alpha_ptr, T* loss_ptr, NdIndexOffsetHelper input_helper, - NdIndexOffsetHelper alpha_helper, const int64_t batch_size, - const int64_t max_input_length, const int64_t max_target_length, - const int blank, const int32_t targets_ndim) { - constexpr T neginf = -INFINITY; - const int32_t bid = blockIdx.x; - const int32_t tid = threadIdx.x; - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - if (tid == 0) { - if (input_lengths_ptr[b] > max_input_length) {asm("s_trap 0;");} - if (target_lengths_ptr[b] > max_target_length) {asm("s_trap 0;");} - } - } - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - IDX input_length = input_lengths_ptr[b]; - IDX target_length = target_lengths_ptr[b]; - - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, s)] = neginf; - } - if (tid == 0) { - alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 0)] = - log_probs_ptr[input_helper.NdIndexToOffset(0, b, blank)]; - if (target_length > 0) { - int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, 1, - blank, targets_ndim); - alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 1)] = - log_probs_ptr[input_helper.NdIndexToOffset(0, b, target)]; - } - } - __syncthreads(); - for (IDX t = 1; t < input_length; t++) { - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, - max_target_length, b, s, blank, targets_ndim); - T la1 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s)]; - T la2, la3, lamax = la1; - if (s > 0) { - la2 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 1)]; - if (la2 > lamax) lamax = la2; - } else { - la2 = neginf; - } - if ((s > 1) - && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s - 2, - blank, targets_ndim) - != current_target_prime)) { - la3 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 2)]; - if (la3 > lamax) lamax = la3; - } else { - la3 = neginf; - } - if (lamax == neginf) lamax = 0; - - int64_t idx_t_s = alpha_helper.NdIndexToOffset(b, t, s); - alpha_ptr[idx_t_s] = - log(exp(la1 - lamax) + exp(la2 - lamax) + exp(la3 - lamax)) + lamax - + log_probs_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; - } - __syncthreads(); - } - if (tid == 0) { - if (target_length == 0) { - int64_t idx = alpha_helper.NdIndexToOffset(b, input_length - 1, 0); - loss_ptr[b] = -alpha_ptr[idx]; - } else { - int64_t idx1 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2); - int64_t idx2 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2 - 1); - T l1 = alpha_ptr[idx1]; - T l2 = alpha_ptr[idx2]; - T m = max(l1, l2); - m = ((m == neginf) ? 
0 : m); - T log_likelihood = log(exp(l1 - m) + exp(l2 - m)) + m; - loss_ptr[b] = -log_likelihood; - } - } - } -} - -template -__global__ void CtcLossGradGpu( - const T* grad_out_ptr, const T* loss_ptr, const T* alpha_ptr, const T* log_probs_ptr, - const int* targets_ptr, const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* beta_ptr, T* grad_ptr, NdIndexOffsetHelper input_helper, - NdIndexOffsetHelper beta_helper, const int64_t batch_size, - const int64_t max_input_length, const int64_t max_target_length, const int64_t num_labels, - const int blank, const bool zero_infinity, const int32_t targets_ndim) { - constexpr T neginf = -INFINITY; - const int32_t bid = blockIdx.x; - const int32_t tid = threadIdx.x; - - for (int64_t b = bid; b < batch_size; b += gridDim.x) { - IDX input_length = input_lengths_ptr[b]; - IDX target_length = target_lengths_ptr[b]; - T nll = loss_ptr[b]; - if (zero_infinity && nll == INFINITY) { - for (IDX t = tid; t < max_input_length; t += blockDim.x) { - for (IDX c = 0; c < num_labels; c++) { - grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = 0; - } - } - __syncthreads(); - continue; - } - - if (input_length > 0) { - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, s)] = neginf; - } - if (tid == 0) { - beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] = - log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)]; - if (target_length > 0) { - int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, - 2 * target_length - 1, blank, targets_ndim); - beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] = - log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)]; - } - } - __syncthreads(); - } - for (IDX t = input_length - 2; t >= 0; t--) { - for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { - int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, - max_target_length, b, s, blank, targets_ndim); - T lb1 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s)]; - T lb2, lb3, lbmax = lb1; - if (s < 2 * target_length) { - lb2 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 1)]; - if (lb2 > lbmax) lbmax = lb2; - } else { - lb2 = neginf; - } - if ((s < 2 * target_length - 1) - && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s + 2, - blank, targets_ndim) - != current_target_prime)) { - lb3 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 2)]; - if (lb3 > lbmax) lbmax = lb3; - } else { - lb3 = neginf; - } - if (lbmax == neginf) lbmax = 0; - - int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); - beta_ptr[idx_t_s] = - log(exp(lb1 - lbmax) + exp(lb2 - lbmax) + exp(lb3 - lbmax)) + lbmax - + log_probs_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; - } - __syncthreads(); - } - for (IDX t = tid; t < max_input_length; t += blockDim.x) { - for (IDX c = 0; c < num_labels; c++) { - grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = t < input_length ? 
neginf : 0; - } - } - __syncthreads(); - if (tid == 0) { - grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)] = - alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] - + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)]; - if (target_length > 0) { - int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, - 2 * target_length - 1, blank, targets_ndim); - grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)] = - alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] - + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)]; - } - } - __syncthreads(); - for (IDX t = tid; t < input_length; t += blockDim.x) { - for (IDX s = 0; (t < input_length - 1) && (s < 2 * target_length + 1); s += 1) { - int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, - max_target_length, b, s, blank, targets_ndim); - int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); - T log_alpha_beta = alpha_ptr[idx_t_s] + beta_ptr[idx_t_s]; - T& lcab = grad_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; - if (lcab == neginf) { - lcab = log_alpha_beta; - } else { - T m = max(lcab, log_alpha_beta); - lcab = log(exp(lcab - m) + exp(log_alpha_beta - m)) + m; - } - } - for (int32_t c = 0; c < num_labels; c++) { - T& res = grad_ptr[input_helper.NdIndexToOffset(t, b, c)]; - T lp = log_probs_ptr[input_helper.NdIndexToOffset(t, b, c)]; - res = (exp(lp) - exp(res + nll - lp)) * grad_out_ptr[b]; - } - } - } -} - -} // namespace - -template -struct CtcLossKernelUtil { - static void CtcLossForward(ep::Stream* stream, const T* log_probs_ptr, const int* targets_ptr, - const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* alpha_ptr, T* loss_ptr, - NdIndexOffsetHelper& input_helper, - NdIndexOffsetHelper& alpha_helper, - const int64_t batch_size, const int64_t max_input_length, - const int64_t max_target_length, const int blank, - const int32_t targets_ndim) { - int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; - RUN_CUDA_KERNEL((CtcLossGpu), stream, thread_num, log_probs_ptr, targets_ptr, - input_lengths_ptr, target_lengths_ptr, alpha_ptr, loss_ptr, input_helper, - alpha_helper, batch_size, max_input_length, max_target_length, blank, - targets_ndim); - } - - static void CtcLossBackward(ep::Stream* stream, const T* grad_out_ptr, const T* loss_ptr, - const T* alpha_ptr, const T* log_probs_ptr, const int* targets_ptr, - const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, - T* beta_ptr, T* grad_ptr, - NdIndexOffsetHelper& input_helper, - NdIndexOffsetHelper& beta_helper, - const int64_t batch_size, const int64_t max_input_length, - const int64_t max_target_length, const int64_t num_labels, - const int blank, const bool zero_infinity, - const int32_t targets_ndim) { - int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; - RUN_CUDA_KERNEL((CtcLossGradGpu), stream, thread_num, grad_out_ptr, loss_ptr, alpha_ptr, - log_probs_ptr, targets_ptr, input_lengths_ptr, target_lengths_ptr, beta_ptr, - grad_ptr, input_helper, beta_helper, batch_size, max_input_length, - max_target_length, num_labels, blank, zero_infinity, targets_ndim); - } -}; - -#define INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA(device_type_v, log_probs_dtype_pair, \ - input_lengths_dtype_pair) \ - template struct CtcLossKernelUtil; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA, (DeviceType::kCUDA), - FLOATING_DATA_TYPE_SEQ, 
INDEX_DATA_TYPE_SEQ) -#undef INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/ctc_loss_kernel_util.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__device__ __inline__ static int get_target_prime(const int* targets_ptr, + const IDX* target_lengths_ptr, + int64_t max_target_length, int64_t b, int64_t s, + int blank, const int32_t targets_ndim) { + if (s % 2 == 0) { + return blank; + } else { + int64_t idx = 0; + if (targets_ndim == 1) { + FOR_RANGE(int64_t, i, 0, b) { idx += target_lengths_ptr[i]; } + } else { // targets_ndim == 2 + idx = b * max_target_length; + } + idx += s / 2; + return targets_ptr[idx]; + } +} + +template +__global__ void CtcLossGpu(const T* log_probs_ptr, const int* targets_ptr, + const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* alpha_ptr, T* loss_ptr, NdIndexOffsetHelper input_helper, + NdIndexOffsetHelper alpha_helper, const int64_t batch_size, + const int64_t max_input_length, const int64_t max_target_length, + const int blank, const int32_t targets_ndim) { + constexpr T neginf = -INFINITY; + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + if (tid == 0) { + if (input_lengths_ptr[b] > max_input_length) {asm("s_trap 0;");} + if (target_lengths_ptr[b] > max_target_length) {asm("s_trap 0;");} + } + } + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + IDX input_length = input_lengths_ptr[b]; + IDX target_length = target_lengths_ptr[b]; + + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, s)] = neginf; + } + if (tid == 0) { + alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 0)] = + log_probs_ptr[input_helper.NdIndexToOffset(0, b, blank)]; + if (target_length > 0) { + int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, 1, + blank, targets_ndim); + alpha_ptr[alpha_helper.NdIndexToOffset(b, 0, 1)] = + log_probs_ptr[input_helper.NdIndexToOffset(0, b, target)]; + } + } + __syncthreads(); + for (IDX t = 1; t < input_length; t++) { + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, + max_target_length, b, s, blank, targets_ndim); + T la1 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s)]; + T la2, la3, lamax = la1; + if (s > 0) { + la2 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 1)]; + if (la2 > lamax) lamax = la2; + } else { + la2 = neginf; + } + if ((s > 1) + && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s - 2, + blank, targets_ndim) + != current_target_prime)) { + la3 = alpha_ptr[alpha_helper.NdIndexToOffset(b, t - 1, s - 2)]; + if (la3 > lamax) lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) lamax = 0; + + int64_t idx_t_s = 
alpha_helper.NdIndexToOffset(b, t, s); + alpha_ptr[idx_t_s] = + log(exp(la1 - lamax) + exp(la2 - lamax) + exp(la3 - lamax)) + lamax + + log_probs_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; + } + __syncthreads(); + } + if (tid == 0) { + if (target_length == 0) { + int64_t idx = alpha_helper.NdIndexToOffset(b, input_length - 1, 0); + loss_ptr[b] = -alpha_ptr[idx]; + } else { + int64_t idx1 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2); + int64_t idx2 = alpha_helper.NdIndexToOffset(b, input_length - 1, target_length * 2 - 1); + T l1 = alpha_ptr[idx1]; + T l2 = alpha_ptr[idx2]; + T m = max(l1, l2); + m = ((m == neginf) ? 0 : m); + T log_likelihood = log(exp(l1 - m) + exp(l2 - m)) + m; + loss_ptr[b] = -log_likelihood; + } + } + } +} + +template +__global__ void CtcLossGradGpu( + const T* grad_out_ptr, const T* loss_ptr, const T* alpha_ptr, const T* log_probs_ptr, + const int* targets_ptr, const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* beta_ptr, T* grad_ptr, NdIndexOffsetHelper input_helper, + NdIndexOffsetHelper beta_helper, const int64_t batch_size, + const int64_t max_input_length, const int64_t max_target_length, const int64_t num_labels, + const int blank, const bool zero_infinity, const int32_t targets_ndim) { + constexpr T neginf = -INFINITY; + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + + for (int64_t b = bid; b < batch_size; b += gridDim.x) { + IDX input_length = input_lengths_ptr[b]; + IDX target_length = target_lengths_ptr[b]; + T nll = loss_ptr[b]; + if (zero_infinity && nll == INFINITY) { + for (IDX t = tid; t < max_input_length; t += blockDim.x) { + for (IDX c = 0; c < num_labels; c++) { + grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = 0; + } + } + __syncthreads(); + continue; + } + + if (input_length > 0) { + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, s)] = neginf; + } + if (tid == 0) { + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] = + log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)]; + if (target_length > 0) { + int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, + 2 * target_length - 1, blank, targets_ndim); + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] = + log_probs_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)]; + } + } + __syncthreads(); + } + for (IDX t = input_length - 2; t >= 0; t--) { + for (IDX s = tid; s < 2 * target_length + 1; s += blockDim.x) { + int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, + max_target_length, b, s, blank, targets_ndim); + T lb1 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s)]; + T lb2, lb3, lbmax = lb1; + if (s < 2 * target_length) { + lb2 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 1)]; + if (lb2 > lbmax) lbmax = lb2; + } else { + lb2 = neginf; + } + if ((s < 2 * target_length - 1) + && (get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, s + 2, + blank, targets_ndim) + != current_target_prime)) { + lb3 = beta_ptr[beta_helper.NdIndexToOffset(b, t + 1, s + 2)]; + if (lb3 > lbmax) lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) lbmax = 0; + + int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); + beta_ptr[idx_t_s] = + log(exp(lb1 - lbmax) + exp(lb2 - lbmax) + exp(lb3 - lbmax)) + lbmax + + log_probs_ptr[input_helper.NdIndexToOffset(t, b, 
current_target_prime)]; + } + __syncthreads(); + } + for (IDX t = tid; t < max_input_length; t += blockDim.x) { + for (IDX c = 0; c < num_labels; c++) { + grad_ptr[input_helper.NdIndexToOffset(t, b, c)] = t < input_length ? neginf : 0; + } + } + __syncthreads(); + if (tid == 0) { + grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, blank)] = + alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)] + + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length)]; + if (target_length > 0) { + int target = get_target_prime(targets_ptr, target_lengths_ptr, max_target_length, b, + 2 * target_length - 1, blank, targets_ndim); + grad_ptr[input_helper.NdIndexToOffset(input_length - 1, b, target)] = + alpha_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)] + + beta_ptr[beta_helper.NdIndexToOffset(b, input_length - 1, 2 * target_length - 1)]; + } + } + __syncthreads(); + for (IDX t = tid; t < input_length; t += blockDim.x) { + for (IDX s = 0; (t < input_length - 1) && (s < 2 * target_length + 1); s += 1) { + int current_target_prime = get_target_prime(targets_ptr, target_lengths_ptr, + max_target_length, b, s, blank, targets_ndim); + int64_t idx_t_s = beta_helper.NdIndexToOffset(b, t, s); + T log_alpha_beta = alpha_ptr[idx_t_s] + beta_ptr[idx_t_s]; + T& lcab = grad_ptr[input_helper.NdIndexToOffset(t, b, current_target_prime)]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + T m = max(lcab, log_alpha_beta); + lcab = log(exp(lcab - m) + exp(log_alpha_beta - m)) + m; + } + } + for (int32_t c = 0; c < num_labels; c++) { + T& res = grad_ptr[input_helper.NdIndexToOffset(t, b, c)]; + T lp = log_probs_ptr[input_helper.NdIndexToOffset(t, b, c)]; + res = (exp(lp) - exp(res + nll - lp)) * grad_out_ptr[b]; + } + } + } +} + +} // namespace + +template +struct CtcLossKernelUtil { + static void CtcLossForward(ep::Stream* stream, const T* log_probs_ptr, const int* targets_ptr, + const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* alpha_ptr, T* loss_ptr, + NdIndexOffsetHelper& input_helper, + NdIndexOffsetHelper& alpha_helper, + const int64_t batch_size, const int64_t max_input_length, + const int64_t max_target_length, const int blank, + const int32_t targets_ndim) { + int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; + RUN_CUDA_KERNEL((CtcLossGpu), stream, thread_num, log_probs_ptr, targets_ptr, + input_lengths_ptr, target_lengths_ptr, alpha_ptr, loss_ptr, input_helper, + alpha_helper, batch_size, max_input_length, max_target_length, blank, + targets_ndim); + } + + static void CtcLossBackward(ep::Stream* stream, const T* grad_out_ptr, const T* loss_ptr, + const T* alpha_ptr, const T* log_probs_ptr, const int* targets_ptr, + const IDX* input_lengths_ptr, const IDX* target_lengths_ptr, + T* beta_ptr, T* grad_ptr, + NdIndexOffsetHelper& input_helper, + NdIndexOffsetHelper& beta_helper, + const int64_t batch_size, const int64_t max_input_length, + const int64_t max_target_length, const int64_t num_labels, + const int blank, const bool zero_infinity, + const int32_t targets_ndim) { + int32_t thread_num = batch_size * kCudaThreadsNumPerBlock; + RUN_CUDA_KERNEL((CtcLossGradGpu), stream, thread_num, grad_out_ptr, loss_ptr, alpha_ptr, + log_probs_ptr, targets_ptr, input_lengths_ptr, target_lengths_ptr, beta_ptr, + grad_ptr, input_helper, beta_helper, batch_size, max_input_length, + max_target_length, num_labels, blank, zero_infinity, targets_ndim); + } +}; + +#define 
INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA(device_type_v, log_probs_dtype_pair, \ + input_lengths_dtype_pair) \ + template struct CtcLossKernelUtil; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA, (DeviceType::kCUDA), + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) +#undef INSTANTIATE_CTC_LOSS_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/cum_backward_kernel.hip.cpp b/oneflow/user/kernels/cum_backward_kernel.hip.cpp index b609c6e..98f581d 100644 --- a/oneflow/user/kernels/cum_backward_kernel.hip.cpp +++ b/oneflow/user/kernels/cum_backward_kernel.hip.cpp @@ -1,139 +1,139 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/kernel/new_kernel_util.h" - -namespace oneflow { -#ifdef WITH_ROCM -namespace { -template -__global__ void CumProdBackward(const T* dy_ptr, T* dx_ptr, const T* output_ptr, const T* input_ptr, - const int64_t up_space, const int64_t space, - const int64_t down_space, const int64_t thread_num) { - // A thread is responsible for a row along specific dimension. - const size_t up_space_step = space * down_space; - CUDA_1D_KERNEL_LOOP_T(size_t, i, thread_num) { - const size_t up_space_id = i / down_space; - const size_t down_space_id = i % down_space; - const size_t ptr_offset = up_space_id * up_space_step + down_space_id; - auto* dy_ptr_base = dy_ptr + ptr_offset; - auto* dx_ptr_base = dx_ptr + ptr_offset; - auto* input_ptr_base = input_ptr + ptr_offset; - auto* output_ptr_base = output_ptr + ptr_offset; - - // Buffer storing number of zero element along specific dimension. - // Use dx as tmp buffer. - for (size_t j = 0; j < space; j++) { - const size_t data_offset = j * down_space; - int is_zero = input_ptr_base[data_offset] == 0 ? 1 : 0; - dx_ptr_base[data_offset] = is_zero + (j == 0 ? 0 : dx_ptr_base[data_offset - down_space]); - } - - // Find index of first zero in input. - size_t first_zero_index = space; - for (size_t j = 0; j < space; j++) { - const size_t data_offset = j * down_space; - if (dx_ptr_base[data_offset] == 1) { - first_zero_index = j; - break; - } - } - - // Suppose z is index of first zero element in input, - // for element which index is less than z grad is computed as below: - T reverse_cumsum = 0; - for (size_t j = 0; j < first_zero_index; j++) { - const size_t cur_index = first_zero_index - j - 1; - const size_t data_offset = cur_index * down_space; - reverse_cumsum += output_ptr_base[data_offset] * dy_ptr_base[data_offset]; - dx_ptr_base[data_offset] = reverse_cumsum / input_ptr_base[data_offset]; - } - - // Where index is z, its grad is computed as below: - if (first_zero_index == space) { return; } - T cumprod = 1; - T cumsum = 0; - T cumprod_before_first_zero = - first_zero_index == 0 ? 
1 : output_ptr_base[(first_zero_index - 1) * down_space]; - for (size_t j = first_zero_index; j < space; j++) { - const size_t down_space_offset = j * down_space; - // Recover dx_ptr default value - if (dx_ptr_base[down_space_offset] >= 1) { dx_ptr_base[down_space_offset] = 0; } - if (j != first_zero_index) { cumprod *= input_ptr_base[down_space_offset]; } - cumsum += cumprod_before_first_zero * dy_ptr_base[down_space_offset] * cumprod; - } - dx_ptr_base[first_zero_index * down_space] = cumsum; - } -} -} // namespace - -template -class GpuCumProdGradKernel final : public user_op::OpKernel { - public: - GpuCumProdGradKernel() = default; - ~GpuCumProdGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* output = ctx->Tensor4ArgNameAndIndex("output", 0); - const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0); - const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const auto elem_cnt = dy->shape_view().elem_cnt(); - if (!elem_cnt) { return; } - - const auto* output_ptr = output->dptr(); - const auto* input_ptr = input->dptr(); - const auto* dy_ptr = dy->dptr(); - auto* dx_ptr = dx->mut_dptr(); - - // Data partition: up_space|space|down_space - auto dim = ctx->Attr("dim"); - const auto up_space = elem_cnt / dx->shape_view().Count(dim); - const auto space = dx->shape_view().At(dim); - const auto down_space = dx->shape_view().Count(dim + 1); - const size_t thread_num = up_space * down_space; - - if (space == 1) { - Memcpy(ctx->stream(), dx_ptr, dy_ptr, elem_cnt * sizeof(T)); - return; - } - ep::CudaLaunchConfig config{}; - ctx->stream()->As()->InitLaunchConfigWithWaves( - &config, thread_num, /*DefaultBlockSize*/ 256, /*max_wave*/ 1); - CumProdBackward<<stream()->As()->cuda_stream()>>>( - dy_ptr, dx_ptr, output_ptr, input_ptr, up_space, space, down_space, thread_num); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_CUMPROD_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("cumprod_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_CUMPROD_GRAD_KERNEL(float) -REGISTER_CUDA_CUMPROD_GRAD_KERNEL(double) -#undef REGISTER_CUDA_CUMPROD_GRAD_KERNEL -#endif +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/kernel/new_kernel_util.h" + +namespace oneflow { +#ifdef WITH_ROCM +namespace { +template +__global__ void CumProdBackward(const T* dy_ptr, T* dx_ptr, const T* output_ptr, const T* input_ptr, + const int64_t up_space, const int64_t space, + const int64_t down_space, const int64_t thread_num) { + // A thread is responsible for a row along specific dimension. + const size_t up_space_step = space * down_space; + CUDA_1D_KERNEL_LOOP_T(size_t, i, thread_num) { + const size_t up_space_id = i / down_space; + const size_t down_space_id = i % down_space; + const size_t ptr_offset = up_space_id * up_space_step + down_space_id; + auto* dy_ptr_base = dy_ptr + ptr_offset; + auto* dx_ptr_base = dx_ptr + ptr_offset; + auto* input_ptr_base = input_ptr + ptr_offset; + auto* output_ptr_base = output_ptr + ptr_offset; + + // Buffer storing number of zero element along specific dimension. + // Use dx as tmp buffer. + for (size_t j = 0; j < space; j++) { + const size_t data_offset = j * down_space; + int is_zero = input_ptr_base[data_offset] == 0 ? 1 : 0; + dx_ptr_base[data_offset] = is_zero + (j == 0 ? 0 : dx_ptr_base[data_offset - down_space]); + } + + // Find index of first zero in input. + size_t first_zero_index = space; + for (size_t j = 0; j < space; j++) { + const size_t data_offset = j * down_space; + if (dx_ptr_base[data_offset] == 1) { + first_zero_index = j; + break; + } + } + + // Suppose z is index of first zero element in input, + // for element which index is less than z grad is computed as below: + T reverse_cumsum = 0; + for (size_t j = 0; j < first_zero_index; j++) { + const size_t cur_index = first_zero_index - j - 1; + const size_t data_offset = cur_index * down_space; + reverse_cumsum += output_ptr_base[data_offset] * dy_ptr_base[data_offset]; + dx_ptr_base[data_offset] = reverse_cumsum / input_ptr_base[data_offset]; + } + + // Where index is z, its grad is computed as below: + if (first_zero_index == space) { return; } + T cumprod = 1; + T cumsum = 0; + T cumprod_before_first_zero = + first_zero_index == 0 ? 
1 : output_ptr_base[(first_zero_index - 1) * down_space]; + for (size_t j = first_zero_index; j < space; j++) { + const size_t down_space_offset = j * down_space; + // Recover dx_ptr default value + if (dx_ptr_base[down_space_offset] >= 1) { dx_ptr_base[down_space_offset] = 0; } + if (j != first_zero_index) { cumprod *= input_ptr_base[down_space_offset]; } + cumsum += cumprod_before_first_zero * dy_ptr_base[down_space_offset] * cumprod; + } + dx_ptr_base[first_zero_index * down_space] = cumsum; + } +} +} // namespace + +template +class GpuCumProdGradKernel final : public user_op::OpKernel { + public: + GpuCumProdGradKernel() = default; + ~GpuCumProdGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* output = ctx->Tensor4ArgNameAndIndex("output", 0); + const auto* input = ctx->Tensor4ArgNameAndIndex("input", 0); + const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const auto elem_cnt = dy->shape_view().elem_cnt(); + if (!elem_cnt) { return; } + + const auto* output_ptr = output->dptr(); + const auto* input_ptr = input->dptr(); + const auto* dy_ptr = dy->dptr(); + auto* dx_ptr = dx->mut_dptr(); + + // Data partition: up_space|space|down_space + auto dim = ctx->Attr("dim"); + const auto up_space = elem_cnt / dx->shape_view().Count(dim); + const auto space = dx->shape_view().At(dim); + const auto down_space = dx->shape_view().Count(dim + 1); + const size_t thread_num = up_space * down_space; + + if (space == 1) { + Memcpy(ctx->stream(), dx_ptr, dy_ptr, elem_cnt * sizeof(T)); + return; + } + ep::CudaLaunchConfig config{}; + ctx->stream()->As()->InitLaunchConfigWithWaves( + &config, thread_num, /*DefaultBlockSize*/ 256, /*max_wave*/ 1); + CumProdBackward<<stream()->As()->cuda_stream()>>>( + dy_ptr, dx_ptr, output_ptr, input_ptr, up_space, space, down_space, thread_num); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_CUMPROD_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("cumprod_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_CUMPROD_GRAD_KERNEL(float) +REGISTER_CUDA_CUMPROD_GRAD_KERNEL(double) +#undef REGISTER_CUDA_CUMPROD_GRAD_KERNEL +#endif } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/cum_forward_kernel.hip.cpp b/oneflow/user/kernels/cum_forward_kernel.hip.cpp index 1a96aff..1c3671c 100644 --- a/oneflow/user/kernels/cum_forward_kernel.hip.cpp +++ b/oneflow/user/kernels/cum_forward_kernel.hip.cpp @@ -1,169 +1,169 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ndarray/binary_func.h" - -namespace oneflow { -#ifdef WITH_ROCM -namespace { - -// total thread number: cs_up_space * cs_down_space -// in cs_down_space part, use cs_down_space threads -// to calculate as follows(m=cs_down_space-1, n=cs_space-1, '|' stands for dependency): -// dm0, ..., d10, d00 -// | | | -// dm1, ..., d11, d01 -// | | | -// dm2, ..., d12, d02 -// | | | -// ... ... ... -// | | | -// dmn, ..., d1n, d0n -template class BinaryFunc> -__global__ void CumsumForwardGpu(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space, - int64_t cs_down_space) { - CUDA_1D_KERNEL_LOOP(i, cs_up_space * cs_down_space) { - auto cs_up_space_id = i / cs_down_space; - auto cs_down_space_id = i - (i / cs_down_space) * cs_down_space; - - auto* in_ptr_base = in_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; - auto* out_ptr_base = out_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; - - // calculate cs_space data in one thread - for (auto j = 0; j < cs_space; j++) { - auto idx = j * cs_down_space; - out_ptr_base[idx] = in_ptr_base[idx]; - if (j != 0) { - out_ptr_base[idx] = - BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); - } - } - } -} -template class BinaryFunc> -__global__ void CumsumForwardGpuUpSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_space, - int64_t cs_down_space) { - CUDA_1D_KERNEL_LOOP(i, cs_down_space) { - auto* in_ptr_base = in_ptr + i; - auto* out_ptr_base = out_ptr + i; - - // calculate cs_space data in one thread - for (auto j = 0; j < cs_space; j++) { - auto idx = j * cs_down_space; - out_ptr_base[idx] = in_ptr_base[idx]; - if (j != 0) { - out_ptr_base[idx] = - BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); - } - } - } -} -template class BinaryFunc> -__global__ void CumsumForwardGpuDownSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_up_space, - int64_t cs_space) { - CUDA_1D_KERNEL_LOOP(i, cs_up_space) { - auto* in_ptr_base = in_ptr + i * cs_space; - auto* out_ptr_base = out_ptr + i * cs_space; - - // calculate cs_space data in one thread - for (auto j = 0; j < cs_space; j++) { - out_ptr_base[j] = in_ptr_base[j]; - if (j != 0) { out_ptr_base[j] = BinaryFunc::Invoke(out_ptr_base[j], out_ptr_base[j - 1]); } - } - } -} -} // namespace - -template class BinaryFunc> -class GpuCumKernel : public user_op::OpKernel { - public: - GpuCumKernel() = default; - ~GpuCumKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - // judge whether tensor has 0 size dimension first - const auto* in = ctx->Tensor4ArgNameAndIndex("x", 0); - auto elem_cnt = in->shape_view().elem_cnt(); - if (!elem_cnt) { return; } - - auto* out = ctx->Tensor4ArgNameAndIndex("y", 0); - auto dim = ctx->Attr("dim"); - const auto* in_ptr = in->dptr(); - auto* out_ptr = out->mut_dptr(); - - // data partition: up_space|space|down_space - auto up_space = elem_cnt / in->shape_view().Count(dim); - auto space = in->shape_view().At(dim); - auto down_space = in->shape_view().Count(dim + 1); - auto thread_num = up_space * down_space; - - if (up_space == 1) { - RUN_CUDA_KERNEL((CumsumForwardGpuUpSpaceIs1), ctx->stream(), thread_num, - in_ptr, out_ptr, space, down_space); - } else if (down_space == 1) { - 
RUN_CUDA_KERNEL((CumsumForwardGpuDownSpaceIs1), ctx->stream(), thread_num, - in_ptr, out_ptr, up_space, space); - } else { - RUN_CUDA_KERNEL((CumsumForwardGpu), ctx->stream(), thread_num, in_ptr, out_ptr, - up_space, space, down_space); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuCumSumKernel final : public GpuCumKernel { - public: - GpuCumSumKernel() = default; - ~GpuCumSumKernel() = default; -}; - -#define REGISTER_CUDA_CUMSUM_KERNEL(dtype) \ - REGISTER_USER_KERNEL("cumsum").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_CUMSUM_KERNEL(int32_t) -REGISTER_CUDA_CUMSUM_KERNEL(int64_t) -REGISTER_CUDA_CUMSUM_KERNEL(float) -REGISTER_CUDA_CUMSUM_KERNEL(double) -#undef REGISTER_CUDA_CUMSUM_KERNEL - -template -class GpuCumProdKernel final : public GpuCumKernel { - public: - GpuCumProdKernel() = default; - ~GpuCumProdKernel() = default; -}; - -#define REGISTER_CUDA_CUMPROD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("cumprod").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_CUMPROD_KERNEL(int32_t) -REGISTER_CUDA_CUMPROD_KERNEL(int64_t) -REGISTER_CUDA_CUMPROD_KERNEL(float) -REGISTER_CUDA_CUMPROD_KERNEL(double) -#undef REGISTER_CUDA_CUMPROD_KERNEL -#endif +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ndarray/binary_func.h" + +namespace oneflow { +#ifdef WITH_ROCM +namespace { + +// total thread number: cs_up_space * cs_down_space +// in cs_down_space part, use cs_down_space threads +// to calculate as follows(m=cs_down_space-1, n=cs_space-1, '|' stands for dependency): +// dm0, ..., d10, d00 +// | | | +// dm1, ..., d11, d01 +// | | | +// dm2, ..., d12, d02 +// | | | +// ... ... ... 
+// | | | +// dmn, ..., d1n, d0n +template class BinaryFunc> +__global__ void CumsumForwardGpu(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space, + int64_t cs_down_space) { + CUDA_1D_KERNEL_LOOP(i, cs_up_space * cs_down_space) { + auto cs_up_space_id = i / cs_down_space; + auto cs_down_space_id = i - (i / cs_down_space) * cs_down_space; + + auto* in_ptr_base = in_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; + auto* out_ptr_base = out_ptr + cs_up_space_id * cs_space * cs_down_space + cs_down_space_id; + + // calculate cs_space data in one thread + for (auto j = 0; j < cs_space; j++) { + auto idx = j * cs_down_space; + out_ptr_base[idx] = in_ptr_base[idx]; + if (j != 0) { + out_ptr_base[idx] = + BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); + } + } + } +} +template class BinaryFunc> +__global__ void CumsumForwardGpuUpSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_space, + int64_t cs_down_space) { + CUDA_1D_KERNEL_LOOP(i, cs_down_space) { + auto* in_ptr_base = in_ptr + i; + auto* out_ptr_base = out_ptr + i; + + // calculate cs_space data in one thread + for (auto j = 0; j < cs_space; j++) { + auto idx = j * cs_down_space; + out_ptr_base[idx] = in_ptr_base[idx]; + if (j != 0) { + out_ptr_base[idx] = + BinaryFunc::Invoke(out_ptr_base[idx], out_ptr_base[idx - cs_down_space]); + } + } + } +} +template class BinaryFunc> +__global__ void CumsumForwardGpuDownSpaceIs1(const T* in_ptr, T* out_ptr, int64_t cs_up_space, + int64_t cs_space) { + CUDA_1D_KERNEL_LOOP(i, cs_up_space) { + auto* in_ptr_base = in_ptr + i * cs_space; + auto* out_ptr_base = out_ptr + i * cs_space; + + // calculate cs_space data in one thread + for (auto j = 0; j < cs_space; j++) { + out_ptr_base[j] = in_ptr_base[j]; + if (j != 0) { out_ptr_base[j] = BinaryFunc::Invoke(out_ptr_base[j], out_ptr_base[j - 1]); } + } + } +} +} // namespace + +template class BinaryFunc> +class GpuCumKernel : public user_op::OpKernel { + public: + GpuCumKernel() = default; + ~GpuCumKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + // judge whether tensor has 0 size dimension first + const auto* in = ctx->Tensor4ArgNameAndIndex("x", 0); + auto elem_cnt = in->shape_view().elem_cnt(); + if (!elem_cnt) { return; } + + auto* out = ctx->Tensor4ArgNameAndIndex("y", 0); + auto dim = ctx->Attr("dim"); + const auto* in_ptr = in->dptr(); + auto* out_ptr = out->mut_dptr(); + + // data partition: up_space|space|down_space + auto up_space = elem_cnt / in->shape_view().Count(dim); + auto space = in->shape_view().At(dim); + auto down_space = in->shape_view().Count(dim + 1); + auto thread_num = up_space * down_space; + + if (up_space == 1) { + RUN_CUDA_KERNEL((CumsumForwardGpuUpSpaceIs1), ctx->stream(), thread_num, + in_ptr, out_ptr, space, down_space); + } else if (down_space == 1) { + RUN_CUDA_KERNEL((CumsumForwardGpuDownSpaceIs1), ctx->stream(), thread_num, + in_ptr, out_ptr, up_space, space); + } else { + RUN_CUDA_KERNEL((CumsumForwardGpu), ctx->stream(), thread_num, in_ptr, out_ptr, + up_space, space, down_space); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuCumSumKernel final : public GpuCumKernel { + public: + GpuCumSumKernel() = default; + ~GpuCumSumKernel() = default; +}; + +#define REGISTER_CUDA_CUMSUM_KERNEL(dtype) \ + REGISTER_USER_KERNEL("cumsum").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && 
(user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_CUMSUM_KERNEL(int32_t) +REGISTER_CUDA_CUMSUM_KERNEL(int64_t) +REGISTER_CUDA_CUMSUM_KERNEL(float) +REGISTER_CUDA_CUMSUM_KERNEL(double) +#undef REGISTER_CUDA_CUMSUM_KERNEL + +template +class GpuCumProdKernel final : public GpuCumKernel { + public: + GpuCumProdKernel() = default; + ~GpuCumProdKernel() = default; +}; + +#define REGISTER_CUDA_CUMPROD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("cumprod").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_CUMPROD_KERNEL(int32_t) +REGISTER_CUDA_CUMPROD_KERNEL(int64_t) +REGISTER_CUDA_CUMPROD_KERNEL(float) +REGISTER_CUDA_CUMPROD_KERNEL(double) +#undef REGISTER_CUDA_CUMPROD_KERNEL +#endif } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/data_shuffle_kernel.hip.cpp b/oneflow/user/kernels/data_shuffle_kernel.hip.cpp index 703b044..ed4b9b3 100644 --- a/oneflow/user/kernels/data_shuffle_kernel.hip.cpp +++ b/oneflow/user/kernels/data_shuffle_kernel.hip.cpp @@ -1,1523 +1,1523 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/nccl_util.h" -#include "oneflow/core/job/eager_nccl_comm_manager.h" -#include "oneflow/core/job/parallel_desc.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/user/kernels/gather_kernel_util.h" -#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/embedding/hash_functions.hip.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/hip/atomic.hip.h" - -namespace oneflow { - -namespace { - -template -struct TableEntry { - K key; - uint32_t value; -}; - -template -__global__ void HashTableUniqueAndPartitionPairs(const uint32_t table_capacity, - const uint32_t num_keys, int32_t num_partition, - IDX* unique_counts, TableEntry* table, - const K* keys, const V* values, - K* partitioned_unique_keys, - V* partitioned_unique_values, IDX* reverse_index, - bool need_process_values) { - CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_keys) { - IDX r_index_plus_one = 0; - const K key = keys[i]; - size_t key_hash = HASH()(key); - uint32_t partition_id = key_hash % num_partition; - IDX* unique_count = unique_counts + partition_id; - K* unique_keys = partitioned_unique_keys + partition_id * num_keys; - uint32_t pos = key_hash % table_capacity; - const K key_hi = (key | 0x1); - const K key_lo = (key & 0x1); - uint32_t counter = 0; - while (r_index_plus_one == 0) { - bool prob_next = false; - K* key_ptr = &table[pos].key; - volatile uint32_t* table_value_ptr = &table[pos].value; - const K old_key = cuda::atomic::CAS(key_ptr, 0, key_hi); - if (old_key == 0) { - IDX unique_pos = cuda::atomic::Add(unique_count, 1); - r_index_plus_one = 
unique_pos + 1; - unique_keys[unique_pos] = key; - if (need_process_values) { - partitioned_unique_values[partition_id * num_keys + unique_pos] = values[i]; - } - *table_value_ptr = ((r_index_plus_one << 1U) | key_lo); - } else if (old_key == key_hi) { - const uint32_t value = *table_value_ptr; - if (value == 0) { - // do nothing - } else if ((value & 0x1) == key_lo) { - r_index_plus_one = (value >> 1U); - } else { - prob_next = true; - } - } else { - prob_next = true; - } - if (prob_next) { - pos += 1; - counter += 1; - if (pos >= table_capacity) { pos -= table_capacity; } - if (counter >= table_capacity) { asm volatile("s_trap 0;"); } - } - } - reverse_index[i] = partition_id * num_keys + r_index_plus_one - 1; - } -} - -template -__global__ void GenerateTableIds(int32_t elem_cnt, int32_t num_tables, U* table_ids) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { table_ids[i] = i % num_tables; } -} - -template -void UniqueAndPartition(hipStream_t cuda_stream, int64_t num_ids, size_t capacity, - int64_t num_partition, const K* ids, const V* table_ids, - IDX* num_partitioned_unique_ids_ptr, K* partitioned_unique_ids, - V* partitioned_unique_table_ids, IDX* inverse_unique_partition_indices, - void* workspace_ptr, size_t workspace_bytes, bool need_process_table_ids) { - size_t table_capacity_bytes = capacity * sizeof(TableEntry); - CHECK_GE(workspace_bytes, table_capacity_bytes); - OF_CUDA_CHECK(hipMemsetAsync(workspace_ptr, 0, table_capacity_bytes, cuda_stream)); - OF_CUDA_CHECK( - hipMemsetAsync(num_partitioned_unique_ids_ptr, 0, num_partition * sizeof(IDX), cuda_stream)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(HashTableUniqueAndPartitionPairs), BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, - capacity, num_ids, num_partition, num_partitioned_unique_ids_ptr, - reinterpret_cast*>(workspace_ptr), ids, table_ids, partitioned_unique_ids, - partitioned_unique_table_ids, inverse_unique_partition_indices, need_process_table_ids); -} - -template -void ShuffleData(hipStream_t cuda_stream, ncclComm_t comm, DataType data_type, - const std::vector& send_offsets, - const std::vector& send_elem_cnt, const T* send_data, - const std::vector& recv_offsets, - const std::vector& recv_elem_cnt, T* recv_data) { - ncclDataType_t nccl_data_type = GetNcclDataType(data_type); - const int64_t parallel_num = send_offsets.size(); - OF_NCCL_CHECK(ncclGroupStart()); - for (int64_t i = 0; i < parallel_num; ++i) { - OF_NCCL_CHECK(ncclSend(send_data + send_offsets.at(i), send_elem_cnt.at(i), nccl_data_type, i, - comm, cuda_stream)); - OF_NCCL_CHECK(ncclRecv(recv_data + recv_offsets.at(i), recv_elem_cnt.at(i), nccl_data_type, i, - comm, cuda_stream)); - } - OF_NCCL_CHECK(ncclGroupEnd()); -} - -template -void MakeShuffleParams(const IDX* host_num_unique_matrix, const int64_t num_ids, - const int64_t row_size, int64_t parallel_id, int64_t parallel_num, - std::vector* scatter_offset_vec, - std::vector* scatter_elem_cnt_vec, - std::vector* gather_offset_vec, - std::vector* gather_elem_cnt_vec) { - scatter_offset_vec->resize(parallel_num); - scatter_elem_cnt_vec->resize(parallel_num); - gather_offset_vec->resize(parallel_num); - gather_elem_cnt_vec->resize(parallel_num); - int64_t gather_offset = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t scatter_elem_cnt = - host_num_unique_matrix[parallel_id * parallel_num + i] * row_size; - const int64_t gather_elem_cnt = - host_num_unique_matrix[i * parallel_num + parallel_id] * row_size; - scatter_offset_vec->at(i) = i * num_ids * row_size; - 
scatter_elem_cnt_vec->at(i) = scatter_elem_cnt; - gather_offset_vec->at(i) = gather_offset; - gather_elem_cnt_vec->at(i) = gather_elem_cnt; - gather_offset += gather_elem_cnt; - } -} - -template -void ShuffleIdsAndTableIds(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, DataType ids_data_type, - DataType table_ids_data_type, IDX* host_num_unique_matrix, - K* partitioned_unique_ids, U* partitioned_unique_table_ids, - K* received_ids, U* received_table_ids, int64_t* received_elem_cnt, - bool need_process_table_ids) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - MakeShuffleParams(host_num_unique_matrix, num_ids, 1, parallel_id, parallel_num, &send_offsets, - &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, ids_data_type, send_offsets, send_elem_cnt, partitioned_unique_ids, - recv_offsets, recv_elem_cnt, received_ids); - *received_elem_cnt = recv_offsets.at(parallel_num - 1) + recv_elem_cnt.at(parallel_num - 1); - if (need_process_table_ids) { - ShuffleData(cuda_stream, comm, table_ids_data_type, send_offsets, send_elem_cnt, - partitioned_unique_table_ids, recv_offsets, recv_elem_cnt, received_table_ids); - } -} - -enum class IdShuffleBufferType { - kNumPartitionedUnique = 0, - kPartitionedUniqueIds, - kReceivedIds, - kTableIds, - kPartitionedUniqueTableIds, - kReceivedTableIds, - kWorkspace, - kMaxType -}; - -template -class IdShuffleTmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(IdShuffleTmpBufferManager); - IdShuffleTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t parallel_num, - bool need_table_ids, bool need_process_table_ids) - : offset_(0), - offsets_(static_cast(IdShuffleBufferType::kMaxType), -1), - sizes_(static_cast(IdShuffleBufferType::kMaxType)), - ptr_(ptr) { - const int64_t num_table_ids = need_process_table_ids ? num_ids : 0; - const size_t table_ids_bytes = need_table_ids ? 
num_ids * sizeof(U) : 0; - AllocBuffer(IdShuffleBufferType::kNumPartitionedUnique, parallel_num * sizeof(IDX)); - size_t partitioned_ids_bytes = parallel_num * num_ids * sizeof(K); - AllocBuffer(IdShuffleBufferType::kPartitionedUniqueIds, partitioned_ids_bytes); - AllocBuffer(IdShuffleBufferType::kReceivedIds, partitioned_ids_bytes); - AllocBuffer(IdShuffleBufferType::kTableIds, table_ids_bytes); - size_t partitioned_table_ids_bytes = parallel_num * num_table_ids * sizeof(U); - AllocBuffer(IdShuffleBufferType::kPartitionedUniqueTableIds, partitioned_table_ids_bytes); - AllocBuffer(IdShuffleBufferType::kReceivedTableIds, partitioned_table_ids_bytes); - const size_t hash_table_capacity = parallel_num * num_ids; - AllocBuffer(IdShuffleBufferType::kWorkspace, hash_table_capacity * sizeof(TableEntry)); - } - - template - T* Ptr(IdShuffleBufferType type) { - CHECK(ptr_ != nullptr); - int64_t offset = offsets_.at(static_cast(type)); - CHECK_NE(offset, -1); - return reinterpret_cast(reinterpret_cast(ptr_) + offset); - } - - int64_t Size(IdShuffleBufferType type) { return sizes_.at(static_cast(type)); } - - size_t TotalBufferSize() const { return offset_; } - - private: - void AllocBuffer(IdShuffleBufferType type, size_t size) { - const size_t type_id = static_cast(type); - CHECK_EQ(offsets_.at(type_id), -1); - offsets_.at(type_id) = offset_; - sizes_.at(type_id) = size; - offset_ += GetCudaAlignedSize(size); - } - size_t offset_; - std::vector offsets_; - std::vector sizes_; - void* ptr_; -}; - -template -class DataShuffleKernelState final : public user_op::OpKernelState { - public: - explicit DataShuffleKernelState(user_op::KernelInitContext* ctx) - : device_index_(-1), - stream_name_(EagerNcclCommMgr::kDefaultStreamName), - parallel_desc_(ctx->parallel_desc()) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } - OF_CUDA_CHECK(hipMallocHost( - reinterpret_cast(&host_num_unique_matrix_), - parallel_desc_.parallel_num() * parallel_desc_.parallel_num() * sizeof(IDX))); - } - ~DataShuffleKernelState() { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(host_num_unique_matrix_)); - } - - ncclComm_t comm() { return GetOrCreate().comm; } - - IDX* HostNumUniqueMatrix() { return host_num_unique_matrix_; } - - private: - struct Comm { - Comm(ncclComm_t comm) : comm(comm) {} - ncclComm_t comm; - }; - - const Comm& GetOrCreate() { - if (!comm_) { Init(); } - return *comm_; - } - - void Init() { - std::set> device_set; - for (int64_t parallel_id = 0; parallel_id < parallel_desc_.parallel_num(); ++parallel_id) { - int64_t machine_id = CHECK_JUST(parallel_desc_.MachineId4ParallelId(parallel_id)); - int64_t device_id = CHECK_JUST(parallel_desc_.DeviceId4ParallelId(parallel_id)); - device_set.emplace(std::make_pair(machine_id, device_id)); - } - EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Singleton::Get()); - ncclComm_t comm; - comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); - comm_.reset(new Comm(comm)); - } - - int device_index_; - bool has_independent_stream_; - std::string stream_name_; - ParallelDesc parallel_desc_; - std::unique_ptr comm_; - IDX* host_num_unique_matrix_; -}; - -} // namespace - -template -class IdShuffleKernel final : public user_op::OpKernel { - public: - IdShuffleKernel() = default; - ~IdShuffleKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return 
std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* kernel_state = dynamic_cast*>(state); - CHECK(kernel_state != nullptr); - const user_op::Tensor* ids = ctx->Tensor4ArgNameAndIndex("ids", 0); - user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); - user_op::Tensor* inverse_unique_partition_indices = - ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); - user_op::Tensor* cur_rank_num_unique = ctx->Tensor4ArgNameAndIndex("cur_rank_num_unique", 0); - user_op::Tensor* cur_rank_unique_ids = ctx->Tensor4ArgNameAndIndex("cur_rank_unique_ids", 0); - user_op::Tensor* cur_rank_unique_table_ids = - ctx->Tensor4ArgNameAndIndex("cur_rank_unique_table_ids", 0); - user_op::Tensor* cur_rank_inverse_indices = - ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t num_tables = ctx->Attr("num_tables"); - const bool has_table_ids = ctx->has_input("table_ids", 0); - const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); - const bool need_process_table_ids = (has_table_ids || num_tables > 1); - const int64_t num_ids = ids->shape_view().elem_cnt(); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - IdShuffleTmpBufferManager buffer_manager( - tmp_buffer->mut_dptr(), num_ids, parallel_num, need_gen_table_ids, need_process_table_ids); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.TotalBufferSize()); - - const U* table_ids_ptr; - if (has_table_ids) { - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); - table_ids_ptr = reinterpret_cast(table_ids->dptr()); - } else if (need_gen_table_ids) { - hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, - num_ids, num_tables, buffer_manager.template Ptr(IdShuffleBufferType::kTableIds)); - table_ids_ptr = buffer_manager.template Ptr(IdShuffleBufferType::kTableIds); - } else { - table_ids_ptr = nullptr; - } - IDX* num_partitioned_unique = - buffer_manager.template Ptr(IdShuffleBufferType::kNumPartitionedUnique); - K* partitioned_unique_ids = - buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueIds); - U* partitioned_unique_table_ids = - buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueTableIds); - IDX* num_unique_matrix_ptr = reinterpret_cast(num_unique_matrix->mut_dptr()); - size_t hash_table_capacity = parallel_num * num_ids; - void* workspace_ptr = buffer_manager.Ptr(IdShuffleBufferType::kWorkspace); - size_t workspace_size = buffer_manager.Size(IdShuffleBufferType::kWorkspace); - UniqueAndPartition( - cuda_stream, num_ids, hash_table_capacity, parallel_num, - reinterpret_cast(ids->dptr()), table_ids_ptr, num_partitioned_unique, - partitioned_unique_ids, partitioned_unique_table_ids, - reinterpret_cast(inverse_unique_partition_indices->mut_dptr()), workspace_ptr, - workspace_size, need_process_table_ids); - ncclComm_t comm = kernel_state->comm(); - OF_NCCL_CHECK(ncclAllGather(num_partitioned_unique, num_unique_matrix_ptr, parallel_num, - GetNcclDataType(num_unique_matrix->data_type()), comm, - cuda_stream)); - IDX* host_num_unique_matrix = 
kernel_state->HostNumUniqueMatrix(); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix_ptr, - parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, - cuda_stream)); - CHECK_JUST(ctx->stream()->Sync()); - - K* received_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedIds); - U* received_table_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedTableIds); - int64_t received_elem_cnt = 0; - ShuffleIdsAndTableIds(cuda_stream, comm, parallel_id, parallel_num, num_ids, ids->data_type(), - cur_rank_unique_table_ids->data_type(), host_num_unique_matrix, - partitioned_unique_ids, partitioned_unique_table_ids, received_ids, - received_table_ids, &received_elem_cnt, need_process_table_ids); - UniqueAndPartition( - cuda_stream, received_elem_cnt, hash_table_capacity, 1, received_ids, received_table_ids, - reinterpret_cast(cur_rank_num_unique->mut_dptr()), - reinterpret_cast(cur_rank_unique_ids->mut_dptr()), - reinterpret_cast(cur_rank_unique_table_ids->mut_dptr()), - reinterpret_cast(cur_rank_inverse_indices->mut_dptr()), workspace_ptr, workspace_size, - need_process_table_ids); - if (!need_process_table_ids) { - OF_CUDA_CHECK(hipMemsetAsync(cur_rank_unique_table_ids->mut_dptr(), 0, - received_elem_cnt * sizeof(U), cuda_stream)); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) - -#define TABLE_ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) - -#define IDX_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define REGISTER_CUDA_ID_SHUFFLE_KERNEL(k_dtype_pair, table_id_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("id_shuffle") \ - .SetCreateFn< \ - IdShuffleKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("ids", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ - && (user_op::HobDataType("cur_rank_unique_table_ids", 0) \ - == OF_PP_PAIR_SECOND(table_id_dtype_pair)) \ - && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& ids = ctx->InputTensorDesc("ids", 0); \ - const bool has_table_ids = ctx->has_input("table_ids", 0); \ - const int32_t num_tables = ctx->Attr("num_tables"); \ - const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); \ - const bool need_process_table_ids = (has_table_ids || num_tables > 1); \ - IdShuffleTmpBufferManager \ - buffer_manager(nullptr, ids.shape().elem_cnt(), ctx->parallel_desc().parallel_num(), \ - need_gen_table_ids, need_process_table_ids); \ - return buffer_manager.TotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ID_SHUFFLE_KERNEL, ID_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType 
data_type, IDX* host_num_unique_matrix, - T* reverse_unique_cur_rank_embeddings, T* received_embeddings) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, - reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); -} - -// Quantized Version. -template -void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - int8_t* reverse_unique_cur_rank_embeddings, int8_t* received_embeddings, - T* reverse_cur_rank_quantize_factor, T* recv_quantize_factor) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - // shuffle quantized_embedding - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, - reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); - // shuffle quantize_factor - MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, - parallel_num, &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, - reverse_cur_rank_quantize_factor, recv_offsets, recv_elem_cnt, recv_quantize_factor); -} - -__device__ float RoundHalfAwayFromZero(const float x) { - float abs_val = abs(x); - float floor_val = floor(abs_val + static_cast(0.5)); - return copysignf(floor_val, x); -} - -// warp reduce version. 
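// The "warp reduce version" kernels below implement per-row symmetric int8 quantization for
// the embedding shuffle: each row's absolute maximum is kept as its quantize factor, values
// are scaled by 127 / factor and rounded half away from zero when quantizing, and multiplied
// back by factor / 127 when dequantizing. A minimal host-side sketch of that round trip is
// given here for reference; QuantizeRowHost and DequantizeHost are illustrative names only
// and are not functions from this file.
//
//   #include <algorithm>
//   #include <cmath>
//   #include <cstdint>
//   #include <vector>
//
//   void QuantizeRowHost(const std::vector<float>& row, std::vector<int8_t>* q, float* factor) {
//     float max_abs = 0.0f;
//     for (float v : row) { max_abs = std::max(max_abs, std::fabs(v)); }
//     *factor = max_abs;  // per-row factor, mirrors quantize_factor[row] in the kernel
//     const float scale = max_abs > 0.0f ? 127.0f / max_abs : 0.0f;  // zero guard added in this sketch
//     q->resize(row.size());
//     for (size_t i = 0; i < row.size(); ++i) {
//       // round half away from zero, matching RoundHalfAwayFromZero above
//       const float r = std::copysign(std::floor(std::fabs(row[i] * scale) + 0.5f), row[i]);
//       (*q)[i] = static_cast<int8_t>(r);
//     }
//   }
//
//   float DequantizeHost(int8_t q, float factor) { return static_cast<float>(q) * factor / 127.0f; }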
-constexpr int32_t kWarpSize = 32; -constexpr int32_t kMaxColSize = 1024; - -template -__inline__ __device__ T WarpMaxAllReduce(T val) { - for (int32_t lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) { - // val = max(val, __shfl_xor_sync(0xffffffff, val, lane_mask, thread_group_width)); - val = max(val, __shfl_xor(val, lane_mask, thread_group_width)); - } - return val; -} - -inline hipError_t GetWarpImplNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, - int* num_blocks) { - int dev; - { - hipError_t err = hipGetDevice(&dev); - if (err != hipSuccess) { return err; } - } - int sm_count; - { - hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); - if (err != hipSuccess) { return err; } - } - int tpm; - { - hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); - if (err != hipSuccess) { return err; } - } - *num_blocks = - std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); - return hipSuccess; -} - -template -__global__ void QuantizeWarpImplKernel(const T* src, int8_t* dst, T* quantize_factor, - const int64_t rows, const int64_t cols) { - static_assert(cols_per_thread % pack_size == 0, ""); - static_assert(thread_group_width <= kWarpSize, ""); - static_assert(kWarpSize % thread_group_width == 0, ""); - constexpr int num_packs = cols_per_thread / pack_size; - assert(cols <= cols_per_thread * thread_group_width); - ComputeType buf[rows_per_access][cols_per_thread]; - const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; - const int num_global_thread_group = gridDim.x * blockDim.y; - const int lane_id = threadIdx.x; - const int64_t step = num_global_thread_group * rows_per_access; - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StoreType = cuda::elementwise::PackType; - using StorePack = cuda::elementwise::Pack; - - for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { - ComputeType thread_abs_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; row_id++) { - ComputeType* row_buf = buf[row_id]; - thread_abs_max[row_id] = 0.0; -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; pack_id++) { - const int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - LoadPack load_pack; - if (!padding || col < cols) { - const int64_t load_offset = ((row + row_id) * cols + col) / pack_size; - load_pack.storage = *(reinterpret_cast(src) + load_offset); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - row_buf[pack_offset + i] = static_cast(load_pack.elem[i]); - thread_abs_max[row_id] = max(thread_abs_max[row_id], abs(row_buf[pack_offset + i])); - } - } else { -#pragma unroll - for (int i = 0; i < pack_size; i++) { row_buf[pack_offset + i] = 0.0; } - } - } - } - ComputeType warp_max[rows_per_access]; -#pragma unroll - for (int row_id = 0; row_id < rows_per_access; row_id++) { - warp_max[row_id] = WarpMaxAllReduce(thread_abs_max[row_id]); - if (threadIdx.x == 0) { quantize_factor[row + row_id] = static_cast(warp_max[row_id]); } - ComputeType* row_buf = buf[row_id]; - ComputeType quantize_factor_val = static_cast(127.0) / warp_max[row_id]; -#pragma unroll - for (int col = 0; col < cols_per_thread; col++) { - row_buf[col] = RoundHalfAwayFromZero(row_buf[col] * quantize_factor_val); - } -#pragma unroll - for (int pack_id = 0; pack_id < num_packs; pack_id++) { - const 
int pack_offset = pack_id * pack_size; - const int col = (pack_id * thread_group_width + lane_id) * pack_size; - StorePack store_pack; - if (!padding || col < cols) { - const int64_t store_offset = ((row + row_id) * cols + col) / pack_size; - for (int i = 0; i < pack_size; i++) { - store_pack.elem[i] = static_cast(row_buf[pack_id * pack_size + i]); - } - *(reinterpret_cast(dst) + store_offset) = store_pack.storage; - } - } - } - } -} - -template -inline hipError_t LaunchQuantizeWarpImpl(hipStream_t stream, const T* src, int8_t* dst, - T* quantize_factor, const int64_t rows, - const int64_t cols) { - constexpr int block_size = 128; - constexpr int waves = 32; - static_assert(block_size % thread_group_width == 0, ""); - constexpr int thread_groups_per_block = block_size / thread_group_width; - dim3 block_dim(thread_group_width, thread_groups_per_block); - const int64_t num_blocks = - (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; - int grid_dim_x = 0; - - hipError_t err = GetWarpImplNumBlocks(block_size, num_blocks, waves, &grid_dim_x); - if (err != hipSuccess) { return err; } - - QuantizeWarpImplKernel - <<>>(src, dst, quantize_factor, rows, cols); - return hipPeekAtLastError(); -} - -template -inline hipError_t DispatchQuantizeWarpImplPadding(hipStream_t stream, const T* src, int8_t* dst, - T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols == cols_per_thread * thread_group_width) { - return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, - cols); - } else { - return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, - cols); - } -} - -template -typename std::enable_if::type DispatchQuantizeWarpImplCols( - hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } else { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchQuantizeWarpImplPadding( \ - stream, src, dst, quantize_factor, rows, cols); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(5) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(7) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(9) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(11) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(13) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(15) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(17) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(19) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(21) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(23) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(25) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(27) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(29) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(31) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -typename std::enable_if::type DispatchQuantizeWarpImplCols( - hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, - const int64_t cols) { - if (cols <= 0) { return hipErrorInvalidValue; } -#define 
DEFINE_ONE_ELIF(thread_group_width) \ - else if (cols <= (thread_group_width)*pack_size) { \ - if (rows % 2 == 0) { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } else { \ - return DispatchQuantizeWarpImplPadding(stream, src, dst, \ - quantize_factor, rows, cols); \ - } \ - } - DEFINE_ONE_ELIF(1) - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF -#define DEFINE_ONE_ELIF(col) \ - else if (cols <= (col)*kWarpSize) { \ - return DispatchQuantizeWarpImplPadding( \ - stream, src, dst, quantize_factor, rows, cols); \ - } - DEFINE_ONE_ELIF(4) - DEFINE_ONE_ELIF(6) - DEFINE_ONE_ELIF(8) - DEFINE_ONE_ELIF(10) - DEFINE_ONE_ELIF(12) - DEFINE_ONE_ELIF(14) - DEFINE_ONE_ELIF(16) - DEFINE_ONE_ELIF(18) - DEFINE_ONE_ELIF(20) - DEFINE_ONE_ELIF(22) - DEFINE_ONE_ELIF(24) - DEFINE_ONE_ELIF(26) - DEFINE_ONE_ELIF(28) - DEFINE_ONE_ELIF(30) - DEFINE_ONE_ELIF(32) -#undef DEFINE_ONE_ELIF - else { - return hipErrorInvalidValue; - } -} - -template -struct DispatchQuantizeWarpImplPackSize { - hipError_t operator()(hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, - const int64_t rows, const int64_t cols) { - if (cols % 2 == 0) { - return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, - rows, cols); - } else { - return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, - rows, cols); - } - } -}; - -template -__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, - IDX elem_cnt); - -template -__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, - IDX elem_cnt) { - IDX global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - for (int index = global_thread_id * pack_size; index < elem_cnt; - index += gridDim.x * blockDim.x * pack_size) { - IDX quantize_factor_idx = index / col_size; - ComputeType quantize_factor_val = static_cast(quantize_factor[quantize_factor_idx]) - / static_cast(127.0); - using LoadPackType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StorePackType = cuda::elementwise::PackType; - using StorePack = cuda::elementwise::Pack; - LoadPack load_pack{}; - StorePack store_pack{}; - load_pack.storage = *(reinterpret_cast(x) + index / pack_size); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - store_pack.elem[i] = - static_cast(static_cast(load_pack.elem[i]) * quantize_factor_val); - } - *(reinterpret_cast(out) + index / pack_size) = store_pack.storage; - } -} - -template -hipError_t DispatchDequantizeKernelPackSize(hipStream_t stream, const int8_t* src, - T* quantize_factor, T* dst, const int64_t col_size, - const int64_t elem_cnt) { - const int64_t pack_num = elem_cnt / pack_size; - int grid_size = 0; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (err != hipSuccess) { return err; } - hipLaunchKernelGGL(HIP_KERNEL_NAME(DequantizeKernel), grid_size, cuda::elementwise::kBlockSize, 0, stream, src, quantize_factor, dst, col_size, - elem_cnt); - return hipSuccess; -} - -template -inline hipError_t LaunchDequantizeKernel(hipStream_t stream, const int8_t* src, - T* quantize_factor, T* dst, const int64_t col_size, - const int64_t elem_cnt) { - constexpr int quantized_src_pack_size = cuda::elementwise::PackSize(); - constexpr int dst_pack_size = cuda::elementwise::PackSize(); - int launch_pack_size = std::min(quantized_src_pack_size, dst_pack_size); - if (launch_pack_size == 8 && 
col_size % 8 == 0) { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } else if (launch_pack_size == 4 && col_size % 4 == 0) { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } else if (launch_pack_size == 2 && col_size % 2 == 0) { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } else { - hipError_t err = DispatchDequantizeKernelPackSize( - stream, src, quantize_factor, dst, col_size, elem_cnt); - if (err != hipSuccess) { return err; } - } - return hipPeekAtLastError(); -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -template -class EmbeddingShuffleKernel final : public user_op::OpKernel { - public: - EmbeddingShuffleKernel() = default; - ~EmbeddingShuffleKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* kernel_state = dynamic_cast*>(state); - CHECK(kernel_state != nullptr); - const user_op::Tensor* cur_rank_embeddings = - ctx->Tensor4ArgNameAndIndex("cur_rank_embeddings", 0); - const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); - const user_op::Tensor* cur_rank_inverse_indices = - ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); - const user_op::Tensor* inverse_unique_partition_indices = - ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); - user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - ncclComm_t comm = kernel_state->comm(); - using ComputeType = typename DefaultComputeType::type; - const int64_t embedding_size = cur_rank_embeddings->shape_view().At(1); - IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); - DataType data_type = cur_rank_embeddings->data_type(); - const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - bool enable_quantized_comm_env_var = - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); - bool enable_quantized_comm = enable_quantized_comm_env_var && (embedding_size < kMaxColSize); - if (enable_quantized_comm_env_var && !enable_quantized_comm) { - LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " - "embedding_size less equal than 1024 can use quantized communication. 
"; - } - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - OF_CUDA_CHECK(hipMemcpyAsync( - host_num_unique_matrix, reinterpret_cast(num_unique_matrix->dptr()), - parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, cuda_stream)); - CHECK_JUST(ctx->stream()->Sync()); - int64_t cur_rank_num_ids = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; - } - size_t full_elem_cnt = parallel_num * num_ids * embedding_size; - CHECK_EQ(full_elem_cnt, cur_rank_embeddings->shape_view().elem_cnt()); - if (!enable_quantized_comm) { - size_t reverse_unique_cur_rank_embeddings_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; - - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - reverse_unique_cur_rank_embeddings_size + received_embeddings_size); - - T* reverse_unique_cur_rank_embeddings = reinterpret_cast(tmp_buffer->mut_dptr()); - T* received_embeddings = reinterpret_cast(tmp_buffer->mut_dptr() - + reverse_unique_cur_rank_embeddings_size); - // reverse cur_rank unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, cur_rank_embeddings->dptr(), - Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), - reverse_unique_cur_rank_embeddings, 0); - - ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, - received_embeddings); - - // reverse unique_partition - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, - Shape({1, parallel_num * num_ids, embedding_size}), embeddings->mut_dptr(), 0); - } else { - size_t reverse_unique_cur_rank_embeddings_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); - size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; - size_t quantize_cur_rank_embeddings_size = reverse_unique_cur_rank_embeddings_size; - size_t reverse_recv_quantize_cur_rank_embeddings_size = - reverse_unique_cur_rank_embeddings_size; - size_t cur_rank_quantize_factor_size = - GetCudaAlignedSize(cur_rank_embeddings->shape_view().At(0) * sizeof(T)); - size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; - size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; - size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - reverse_unique_cur_rank_embeddings_size + received_embeddings_size - + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size - + reverse_recv_quantize_factor_size); - int8_t* reverse_unique_cur_rank_embeddings = - reinterpret_cast(tmp_buffer->mut_dptr()); - int8_t* received_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size); - int8_t* quantize_cur_rank_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size); - int8_t* reverse_recv_quantize_cur_rank_embeddings = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + 
quantize_cur_rank_embeddings_size); - T* cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size); - T* reverse_cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size); - T* recv_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size); - T* reverse_recv_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size - + received_embeddings_size + quantize_cur_rank_embeddings_size - + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size - + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size); - DispatchQuantizeWarpImplPackSize()( - cuda_stream, cur_rank_embeddings->dptr(), quantize_cur_rank_embeddings, - cur_rank_quantize_factor, cur_rank_num_ids, embedding_size); - // reverse cur_rank embedding unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, quantize_cur_rank_embeddings, - Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), - reverse_unique_cur_rank_embeddings, 0); - - // reverse cur_rank quantize factor unique - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_num_ids, cur_rank_quantize_factor, - Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, 1}), - reverse_cur_rank_quantize_factor, 0); - - ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, - data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, - received_embeddings, reverse_cur_rank_quantize_factor, - recv_quantize_factor); - - // reverse unique_partition - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, - Shape({1, parallel_num * num_ids, embedding_size}), - reverse_recv_quantize_cur_rank_embeddings, 0); - - GatherKernelUtilImpl::Forward( - ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), - inverse_unique_partition_indices->shape_view().elem_cnt(), recv_quantize_factor, - Shape({1, parallel_num * num_ids, 1}), reverse_recv_quantize_factor, 0); - - int32_t dequantize_row_size = inverse_unique_partition_indices->shape_view().elem_cnt(); - IDX dequantize_elem_cnt = dequantize_row_size * embedding_size; - OF_CUDA_CHECK((LaunchDequantizeKernel( - cuda_stream, reverse_recv_quantize_cur_rank_embeddings, reverse_recv_quantize_factor, - embeddings->mut_dptr(), embedding_size, dequantize_elem_cnt))); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_shuffle") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && 
(user_op::HobDataType("cur_rank_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ - && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& cur_rank_embeddings = \ - ctx->InputTensorDesc("cur_rank_embeddings", 0); \ - bool enable_quantized_comm = \ - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ - && (cur_rank_embeddings.shape().At(1) < kMaxColSize); \ - size_t tmp_size = 0; \ - if (!enable_quantized_comm) { \ - size_t reverse_cur_rank_embeddings_size = GetCudaAlignedSize( \ - cur_rank_embeddings.shape().elem_cnt() * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t recv_unique_embeddings_size = reverse_cur_rank_embeddings_size; \ - tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings_size; \ - } else { \ - size_t total_elem_cnt = cur_rank_embeddings.shape().elem_cnt(); \ - size_t reverse_cur_rank_embeddings_size = \ - GetCudaAlignedSize(total_elem_cnt * sizeof(int8_t)); \ - size_t recv_unique_embeddings = reverse_cur_rank_embeddings_size; \ - size_t quantize_cur_rank_embeddings_size = reverse_cur_rank_embeddings_size; \ - size_t reverse_recv_quantize_cur_rank_embeddings_size = \ - reverse_cur_rank_embeddings_size; \ - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ - cur_rank_embeddings.shape().At(0) * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; \ - tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings \ - + quantize_cur_rank_embeddings_size \ - + reverse_recv_quantize_cur_rank_embeddings_size \ - + cur_rank_quantize_factor_size + reverse_cur_rank_quantize_factor_size \ - + recv_quantize_factor_size + reverse_recv_quantize_factor_size; \ - } \ - return tmp_size; \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL, - FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - - -template -void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - T* unique_partition_embedding_grad, T* received_embeddings_grad) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, - unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, - received_embeddings_grad); -} - -// Quantize Version. -template -void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, - int64_t parallel_num, int64_t num_ids, int64_t embedding_size, - DataType data_type, IDX* host_num_unique_matrix, - int8_t* unique_partition_embedding_grad, - int8_t* received_embeddings_grad, T* cur_rank_quantize_factor, - T* received_cur_rank_quantize_factor) { - std::vector send_offsets; - std::vector send_elem_cnt; - std::vector recv_offsets; - std::vector recv_elem_cnt; - // Shuffle Embedding Grad. 
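-  // The quantized gradient exchange happens in two passes: first the int8 gradient
-  // payload is shuffled with row_size = embedding_size and DataType::kInt8, then the
-  // per-row quantize factors are shuffled with row_size = 1 in the original dtype,
-  // so each received gradient row arrives together with the scale needed to
-  // dequantize it on the receiving rank.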
- MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, - &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, - unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, - received_embeddings_grad); - // Shuffle Quantize factor. - MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, - parallel_num, &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); - ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, cur_rank_quantize_factor, - recv_offsets, recv_elem_cnt, received_cur_rank_quantize_factor); -} - -template -__global__ void UnsortedSegmentHalfGpu(const IDX in_h2_elem_cnt, const IDX h2_inner_dim_size, - const IDX inner_dim_size, const half* data, - const K* segment_ids, const IDX num_segments, - half2* out_h2) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, in_h2_elem_cnt) { - const IDX segment_id_idx = i / h2_inner_dim_size; - const IDX h2_inner_idx = i - segment_id_idx * h2_inner_dim_size; - const IDX inner_idx_0 = 2 * h2_inner_idx; - const IDX inner_idx_1 = inner_idx_0 + 1; - const half* data_row = data + segment_id_idx * inner_dim_size; - half2 val; - val.data.x = data_row[inner_idx_0]; - val.data.y = (inner_idx_1 >= inner_dim_size) ? static_cast(0) : data_row[inner_idx_1]; - const IDX idx = segment_ids[segment_id_idx]; - const IDX out_h2_offset = idx * h2_inner_dim_size + h2_inner_idx; - cuda::atomic::Add(out_h2 + out_h2_offset, val); - } -} - -template -struct UnsortedSegmentSumPad { - void operator()(ep::Stream* stream, const K* segment_ids, const T* data, int64_t num_segment_ids, - int64_t num_segments, int64_t inner_dim_size, int64_t padded_inner_dim_size, - T* out) const { - UNIMPLEMENTED(); - } -}; - -template -struct UnsortedSegmentSumPad { - void operator()(ep::Stream* stream, const K* segment_ids, const half* data, - int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, - int64_t padded_inner_dim_size, half* out) const { - const int64_t data_elem_cnt = num_segment_ids * inner_dim_size; - const int64_t out_elem_cnt = num_segments * padded_inner_dim_size; - CHECK_EQ(padded_inner_dim_size % 2, 0); - CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); - const int64_t h2_inner_dim_size = padded_inner_dim_size / 2; - const int64_t in_h2_elem_cnt = num_segment_ids * h2_inner_dim_size; - if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { - UnsortedSegmentHalfGpu - <<As()->cuda_stream()>>>( - in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, - reinterpret_cast(out)); - } else { - UnsortedSegmentHalfGpu - <<As()->cuda_stream()>>>( - in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, - reinterpret_cast(out)); - } - } -}; - -template -void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const T* data, - int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, - int64_t padded_inner_dim_size, T* out) { - if (inner_dim_size == padded_inner_dim_size) { - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - stream, segment_ids, data, num_segment_ids, num_segments, 1, inner_dim_size, 0, out); - } else { - CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); - UnsortedSegmentSumPad()(stream, segment_ids, data, num_segment_ids, num_segments, - inner_dim_size, padded_inner_dim_size, out); - } -} - -template -void UniquePartitionEmbeddingGrad(ep::Stream* stream, int64_t parallel_id, int64_t 
parallel_num, - int64_t num_ids, int64_t embedding_size, - int64_t padded_embedding_size, const IDX* host_num_unique_matrix, - const T* embedding_grad, - const IDX* inverse_unique_partition_indices, - T* unique_partition_embedding_grad) { - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t offset = i * num_ids * padded_embedding_size; - const int64_t valid_value_size = - host_num_unique_matrix[parallel_id * parallel_num + i] * padded_embedding_size * sizeof(T); - OF_CUDA_CHECK(hipMemsetAsync(unique_partition_embedding_grad + offset, 0, valid_value_size, - stream->As()->cuda_stream())); - } - UnsortedSegmentSum(stream, inverse_unique_partition_indices, embedding_grad, num_ids, - parallel_num * num_ids, embedding_size, padded_embedding_size, - unique_partition_embedding_grad); -} - -template -void UniqueCurRankEmbeddingGrad(ep::Stream* stream, DataType data_type, int64_t cur_rank_num_ids, - int64_t embedding_size, int64_t padded_embedding_size, - const T* cur_rank_embedding_grad, - const IDX* cur_rank_inverse_indices, - T* cur_rank_unique_embedding_grad, T* tmp_buffer) { - T* unsorted_segment_sum_out = - (embedding_size == padded_embedding_size) ? cur_rank_unique_embedding_grad : tmp_buffer; - OF_CUDA_CHECK(hipMemsetAsync(unsorted_segment_sum_out, 0, - cur_rank_num_ids * padded_embedding_size * sizeof(T), - stream->As()->cuda_stream())); - UnsortedSegmentSum(stream, cur_rank_inverse_indices, cur_rank_embedding_grad, - cur_rank_num_ids, cur_rank_num_ids, padded_embedding_size, - padded_embedding_size, unsorted_segment_sum_out); - if (embedding_size != padded_embedding_size) { - std::unique_ptr primitive = - ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); - DimVector dst_shape = {cur_rank_num_ids, embedding_size}; - DimVector dst_pos_vec = {0, 0}; - DimVector src_shape = {cur_rank_num_ids, padded_embedding_size}; - DimVector src_pos_vec = {0, 0}; - DimVector extent_vec = {cur_rank_num_ids, embedding_size}; - primitive->Launch(stream, data_type, 2, cur_rank_unique_embedding_grad, dst_shape.data(), - dst_pos_vec.data(), unsorted_segment_sum_out, src_shape.data(), - src_pos_vec.data(), extent_vec.data()); - } -} - -int64_t GetPaddedEmbeddingSize(DataType data_type, int64_t embedding_size) { - if (data_type == DataType::kFloat16 && embedding_size % 2 != 0) { - return embedding_size + 1; - } else { - return embedding_size; - } -} - -template -class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { - public: - EmbeddingGradientShuffleKernel() = default; - ~EmbeddingGradientShuffleKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* kernel_state = dynamic_cast*>(state); - CHECK(kernel_state != nullptr); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - - const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); - const user_op::Tensor* cur_rank_inverse_indices = - ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); - const user_op::Tensor* inverse_unique_partition_indices = - ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); - user_op::Tensor* cur_rank_unique_embedding_grad = - ctx->Tensor4ArgNameAndIndex("cur_rank_unique_embedding_grad", 0); - const int64_t embedding_size = 
cur_rank_unique_embedding_grad->shape_view().At(1); - IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); - DataType data_type = embedding_grad->data_type(); - const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - const int64_t padded_embedding_size = GetPaddedEmbeddingSize(data_type, embedding_size); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - ncclComm_t comm = kernel_state->comm(); - using ComputeType = typename DefaultComputeType::type; - bool enable_quantized_comm_env_var = - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); - bool enable_quantized_comm = - enable_quantized_comm_env_var && (padded_embedding_size < kMaxColSize); - if (enable_quantized_comm_env_var && !enable_quantized_comm) { - LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " - "embedding_size less equal than 1024 can use quantized communication. "; - } - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix->dptr(), - parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, - cuda_stream)); - CHECK_JUST(ctx->stream()->Sync()); - - int64_t cur_rank_num_ids = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; - } - size_t full_num_ids = parallel_num * num_ids; - size_t full_elem_cnt = full_num_ids * padded_embedding_size; - size_t unique_partition_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - - if (!enable_quantized_comm) { - size_t received_embedding_grad_size = unique_partition_embedding_grad_size; - T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); - T* received_embedding_grad = - reinterpret_cast(tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - unique_partition_embedding_grad_size + received_embedding_grad_size); - - UniquePartitionEmbeddingGrad( - ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, - host_num_unique_matrix, embedding_grad->dptr(), - reinterpret_cast(inverse_unique_partition_indices->dptr()), - unique_partition_embedding_grad); - - ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, - padded_embedding_size, data_type, host_num_unique_matrix, - unique_partition_embedding_grad, received_embedding_grad); - - // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. 
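-      // After ShuffleEmbeddingsGrad the partitioned gradients have already been sent,
-      // so their buffer is free to serve as the padded scratch that
-      // UniqueCurRankEmbeddingGrad needs when embedding_size != padded_embedding_size.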
- T* buffer_ptr = unique_partition_embedding_grad; - UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, - padded_embedding_size, received_embedding_grad, - reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); - } else { - size_t received_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); - size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize(full_num_ids * sizeof(T)); - size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; - size_t dequantize_cur_rank_embedding_grad_size = - GetCudaAlignedSize(full_elem_cnt * sizeof(T)); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - unique_partition_embedding_grad_size + received_embedding_grad_size - + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size - + received_cur_rank_quantize_factor_size - + dequantize_cur_rank_embedding_grad_size); - T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); - int8_t* received_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); - - int8_t* quantize_cur_rank_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size); - T* cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size); - T* received_cur_rank_quantize_factor = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size - + cur_rank_quantize_factor_size); - T* dequantize_cur_rank_embedding_grad = reinterpret_cast( - tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size - + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size - + cur_rank_quantize_factor_size + received_cur_rank_quantize_factor_size); - - UniquePartitionEmbeddingGrad( - ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, - host_num_unique_matrix, embedding_grad->dptr(), - reinterpret_cast(inverse_unique_partition_indices->dptr()), - unique_partition_embedding_grad); - - // Quantize. - for (int64_t i = 0; i < parallel_num; ++i) { - const int64_t embedding_grad_offset = i * num_ids * padded_embedding_size; - const int64_t quantize_factor_offset = i * num_ids; - const int64_t valid_row_size = host_num_unique_matrix[parallel_id * parallel_num + i]; - DispatchQuantizeWarpImplPackSize()( - cuda_stream, unique_partition_embedding_grad + embedding_grad_offset, - quantize_cur_rank_embedding_grad + embedding_grad_offset, - cur_rank_quantize_factor + quantize_factor_offset, valid_row_size, - padded_embedding_size); - } - - ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, - padded_embedding_size, data_type, host_num_unique_matrix, - quantize_cur_rank_embedding_grad, received_embedding_grad, - cur_rank_quantize_factor, received_cur_rank_quantize_factor); - - int64_t dequantize_cur_rank_num = 0; - for (int64_t i = 0; i < parallel_num; ++i) { - /* - Host num unique matrix: - | Partition0 | Partition1 | - | Rank0 | 2 | 4 | - | Rank1 | 3 | 3 | - After ShuffleEmbeddingGrads, each rank will exchange partition. - For example: - Rank0 will have (matrix[rank0][part0] + matrix[rank1][part0]) grad tensor. 
- Rank1 will have (matrix[rank0][part1] + matrix[rank1][part1]) grad tensor. - */ - dequantize_cur_rank_num += host_num_unique_matrix[i * parallel_num + parallel_id]; - } - IDX dequantize_elem_cnt = dequantize_cur_rank_num * padded_embedding_size; - OF_CUDA_CHECK((LaunchDequantizeKernel( - cuda_stream, received_embedding_grad, received_cur_rank_quantize_factor, - dequantize_cur_rank_embedding_grad, padded_embedding_size, dequantize_elem_cnt))); - // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. - T* buffer_ptr = unique_partition_embedding_grad; - UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, - padded_embedding_size, dequantize_cur_rank_embedding_grad, - reinterpret_cast(cur_rank_inverse_indices->dptr()), - cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_gradient_shuffle") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ - && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& cur_rank_unique_embedding_grad = \ - ctx->InputTensorDesc("cur_rank_unique_embedding_grad", 0); \ - size_t cur_rank_embedding_grad_num = cur_rank_unique_embedding_grad.shape().At(0); \ - size_t embedding_size = cur_rank_unique_embedding_grad.shape().At(1); \ - size_t padded_embedding_size = \ - GetPaddedEmbeddingSize(cur_rank_unique_embedding_grad.data_type(), embedding_size); \ - size_t cur_rank_embedding_grad_elem_cnt = \ - cur_rank_embedding_grad_num * padded_embedding_size; \ - bool enable_quantized_comm = \ - ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ - && (padded_embedding_size < kMaxColSize); \ - size_t tmp_size = 0; \ - if (!enable_quantized_comm) { \ - size_t cur_rank_embedding_grad_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - tmp_size = 2 * cur_rank_embedding_grad_size; \ - } else { \ - size_t unique_partition_embedding_grad_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t received_embedding_grad_size = \ - GetCudaAlignedSize(cur_rank_embedding_grad_elem_cnt * sizeof(int8_t)); \ - size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; \ - size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ - cur_rank_embedding_grad_num * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ - size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ - size_t dequantize_cur_rank_embedding_grad_size = unique_partition_embedding_grad_size; \ - tmp_size = unique_partition_embedding_grad_size + received_embedding_grad_size \ - + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size \ - + received_cur_rank_quantize_factor_size \ - + dequantize_cur_rank_embedding_grad_size; \ - } \ - return tmp_size; \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL, - FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class UniqueKeyValuePairKernel final : public user_op::OpKernel 
{ - public: - UniqueKeyValuePairKernel() = default; - ~UniqueKeyValuePairKernel() override = default; - - private: - using user_op::OpKernel::Compute; - - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* keys = ctx->Tensor4ArgNameAndIndex("keys", 0); - user_op::Tensor* num_unique = ctx->Tensor4ArgNameAndIndex("num_unique", 0); - user_op::Tensor* unique_keys = ctx->Tensor4ArgNameAndIndex("unique_keys", 0); - user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); - user_op::Tensor* inverse_indices = ctx->Tensor4ArgNameAndIndex("inverse_indices", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t num_tables = ctx->Attr("num_tables"); - const bool has_values = ctx->has_input("values", 0); - const bool need_values_buffer = (!has_values && num_tables > 1); - size_t values_buffer_bytes = - need_values_buffer ? GetCudaAlignedSize(keys->shape_view().elem_cnt() * sizeof(V)) : 0; - const int64_t num_keys = keys->shape_view().elem_cnt(); - const int64_t hash_capacity = num_keys; - const size_t workspace_bytes = GetCudaAlignedSize(hash_capacity * sizeof(TableEntry)); - CHECK_LE(values_buffer_bytes + workspace_bytes, tmp_buffer->shape_view().elem_cnt()); - hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); - const V* values_ptr; - if (has_values) { - const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); - values_ptr = reinterpret_cast(values->dptr()); - } else if (need_values_buffer) { - V* values_buffer_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); - hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_keys), kCudaThreadsNumPerBlock, 0, cuda_stream, - num_keys, num_tables, values_buffer_ptr); - values_ptr = values_buffer_ptr; - } else { - values_ptr = nullptr; - } - const bool need_process_table_ids = (has_values || num_tables > 1); - TableEntry* workspace_ptr = - reinterpret_cast*>(tmp_buffer->mut_dptr() + values_buffer_bytes); - UniqueAndPartition( - cuda_stream, num_keys, hash_capacity, 1, reinterpret_cast(keys->dptr()), - values_ptr, reinterpret_cast(num_unique->mut_dptr()), - reinterpret_cast(unique_keys->mut_dptr()), - reinterpret_cast(unique_values->mut_dptr()), - reinterpret_cast(inverse_indices->mut_dptr()), workspace_ptr, workspace_bytes, - need_process_table_ids); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL(k_dtype_pair, value_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("unique_key_value_pair") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("keys", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ - && (user_op::HobDataType("inverse_indices", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(value_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& keys = ctx->InputTensorDesc("keys", 0); \ - const int64_t num_keys = keys.shape().elem_cnt(); \ - const int64_t hash_capacity = num_keys; \ - const size_t workspace_bytes = GetCudaAlignedSize( \ - hash_capacity * sizeof(TableEntry)); \ - const int32_t num_tables = ctx->Attr("num_tables"); \ - const bool has_values = ctx->has_input("values", 0); \ - const bool need_values_buffer = (!has_values && num_tables > 1); \ - size_t values_buffer_bytes = \ - need_values_buffer \ - ? 
GetCudaAlignedSize(num_keys * sizeof(OF_PP_PAIR_FIRST(value_dtype_pair))) \ - : 0; \ - return workspace_bytes + values_buffer_bytes; \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL, ID_DATA_TYPE_SEQ, - ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("id_shuffle"); -REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_shuffle"); -REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_gradient_shuffle"); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/nccl_util.h" +#include "oneflow/core/job/eager_nccl_comm_manager.h" +#include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/user/kernels/gather_kernel_util.h" +#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/embedding/hash_functions.hip.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/hip/atomic.hip.h" + +namespace oneflow { + +namespace { + +template +struct TableEntry { + K key; + uint32_t value; +}; + +template +__global__ void HashTableUniqueAndPartitionPairs(const uint32_t table_capacity, + const uint32_t num_keys, int32_t num_partition, + IDX* unique_counts, TableEntry* table, + const K* keys, const V* values, + K* partitioned_unique_keys, + V* partitioned_unique_values, IDX* reverse_index, + bool need_process_values) { + CUDA_1D_KERNEL_LOOP_T(uint32_t, i, num_keys) { + IDX r_index_plus_one = 0; + const K key = keys[i]; + size_t key_hash = HASH()(key); + uint32_t partition_id = key_hash % num_partition; + IDX* unique_count = unique_counts + partition_id; + K* unique_keys = partitioned_unique_keys + partition_id * num_keys; + uint32_t pos = key_hash % table_capacity; + const K key_hi = (key | 0x1); + const K key_lo = (key & 0x1); + uint32_t counter = 0; + while (r_index_plus_one == 0) { + bool prob_next = false; + K* key_ptr = &table[pos].key; + volatile uint32_t* table_value_ptr = &table[pos].value; + const K old_key = cuda::atomic::CAS(key_ptr, 0, key_hi); + if (old_key == 0) { + IDX unique_pos = cuda::atomic::Add(unique_count, 1); + r_index_plus_one = unique_pos + 1; + unique_keys[unique_pos] = key; + if (need_process_values) { + partitioned_unique_values[partition_id * num_keys + unique_pos] = values[i]; + } + *table_value_ptr = ((r_index_plus_one << 1U) | key_lo); + } else if (old_key == key_hi) { + const uint32_t value = *table_value_ptr; + if (value == 0) { + // do nothing + } else if ((value & 0x1) == key_lo) { + r_index_plus_one = (value >> 1U); + } else { + prob_next = true; + } + } else { + prob_next = true; + } + if (prob_next) { + pos += 1; + counter += 1; + if (pos >= table_capacity) { pos -= table_capacity; } + if (counter >= table_capacity) { asm volatile("s_trap 0;"); } + } + } + 
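+    // At this point r_index_plus_one holds the 1-based position of `key` inside its
+    // partition's unique-key list. The table stores key | 0x1, so a stored key of 0
+    // can always be read as "empty slot" even when the real key is 0; the dropped low
+    // bit is kept in the packed slot value ((unique_index + 1) << 1 | key_lo).
+    // Collisions fall through to linear probing, and the s_trap above fires only when
+    // every slot has been probed, i.e. the table is full.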
reverse_index[i] = partition_id * num_keys + r_index_plus_one - 1; + } +} + +template +__global__ void GenerateTableIds(int32_t elem_cnt, int32_t num_tables, U* table_ids) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { table_ids[i] = i % num_tables; } +} + +template +void UniqueAndPartition(hipStream_t cuda_stream, int64_t num_ids, size_t capacity, + int64_t num_partition, const K* ids, const V* table_ids, + IDX* num_partitioned_unique_ids_ptr, K* partitioned_unique_ids, + V* partitioned_unique_table_ids, IDX* inverse_unique_partition_indices, + void* workspace_ptr, size_t workspace_bytes, bool need_process_table_ids) { + size_t table_capacity_bytes = capacity * sizeof(TableEntry); + CHECK_GE(workspace_bytes, table_capacity_bytes); + OF_CUDA_CHECK(hipMemsetAsync(workspace_ptr, 0, table_capacity_bytes, cuda_stream)); + OF_CUDA_CHECK( + hipMemsetAsync(num_partitioned_unique_ids_ptr, 0, num_partition * sizeof(IDX), cuda_stream)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(HashTableUniqueAndPartitionPairs), BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, + capacity, num_ids, num_partition, num_partitioned_unique_ids_ptr, + reinterpret_cast*>(workspace_ptr), ids, table_ids, partitioned_unique_ids, + partitioned_unique_table_ids, inverse_unique_partition_indices, need_process_table_ids); +} + +template +void ShuffleData(hipStream_t cuda_stream, ncclComm_t comm, DataType data_type, + const std::vector& send_offsets, + const std::vector& send_elem_cnt, const T* send_data, + const std::vector& recv_offsets, + const std::vector& recv_elem_cnt, T* recv_data) { + ncclDataType_t nccl_data_type = GetNcclDataType(data_type); + const int64_t parallel_num = send_offsets.size(); + OF_NCCL_CHECK(ncclGroupStart()); + for (int64_t i = 0; i < parallel_num; ++i) { + OF_NCCL_CHECK(ncclSend(send_data + send_offsets.at(i), send_elem_cnt.at(i), nccl_data_type, i, + comm, cuda_stream)); + OF_NCCL_CHECK(ncclRecv(recv_data + recv_offsets.at(i), recv_elem_cnt.at(i), nccl_data_type, i, + comm, cuda_stream)); + } + OF_NCCL_CHECK(ncclGroupEnd()); +} + +template +void MakeShuffleParams(const IDX* host_num_unique_matrix, const int64_t num_ids, + const int64_t row_size, int64_t parallel_id, int64_t parallel_num, + std::vector* scatter_offset_vec, + std::vector* scatter_elem_cnt_vec, + std::vector* gather_offset_vec, + std::vector* gather_elem_cnt_vec) { + scatter_offset_vec->resize(parallel_num); + scatter_elem_cnt_vec->resize(parallel_num); + gather_offset_vec->resize(parallel_num); + gather_elem_cnt_vec->resize(parallel_num); + int64_t gather_offset = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + const int64_t scatter_elem_cnt = + host_num_unique_matrix[parallel_id * parallel_num + i] * row_size; + const int64_t gather_elem_cnt = + host_num_unique_matrix[i * parallel_num + parallel_id] * row_size; + scatter_offset_vec->at(i) = i * num_ids * row_size; + scatter_elem_cnt_vec->at(i) = scatter_elem_cnt; + gather_offset_vec->at(i) = gather_offset; + gather_elem_cnt_vec->at(i) = gather_elem_cnt; + gather_offset += gather_elem_cnt; + } +} + +template +void ShuffleIdsAndTableIds(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, DataType ids_data_type, + DataType table_ids_data_type, IDX* host_num_unique_matrix, + K* partitioned_unique_ids, U* partitioned_unique_table_ids, + K* received_ids, U* received_table_ids, int64_t* received_elem_cnt, + bool need_process_table_ids) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + 
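+  // host_num_unique_matrix is laid out as matrix[rank][partition]: entry
+  // [i * parallel_num + j] counts the unique ids rank i holds for partition j.
+  // MakeShuffleParams below therefore takes the send counts from row parallel_id and
+  // the receive counts from column parallel_id. For example, with two ranks and
+  // matrix = [[2, 4], [3, 3]], rank 0 sends 2 and 4 ids and receives 2 + 3 ids.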
std::vector recv_elem_cnt; + MakeShuffleParams(host_num_unique_matrix, num_ids, 1, parallel_id, parallel_num, &send_offsets, + &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, ids_data_type, send_offsets, send_elem_cnt, partitioned_unique_ids, + recv_offsets, recv_elem_cnt, received_ids); + *received_elem_cnt = recv_offsets.at(parallel_num - 1) + recv_elem_cnt.at(parallel_num - 1); + if (need_process_table_ids) { + ShuffleData(cuda_stream, comm, table_ids_data_type, send_offsets, send_elem_cnt, + partitioned_unique_table_ids, recv_offsets, recv_elem_cnt, received_table_ids); + } +} + +enum class IdShuffleBufferType { + kNumPartitionedUnique = 0, + kPartitionedUniqueIds, + kReceivedIds, + kTableIds, + kPartitionedUniqueTableIds, + kReceivedTableIds, + kWorkspace, + kMaxType +}; + +template +class IdShuffleTmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(IdShuffleTmpBufferManager); + IdShuffleTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t parallel_num, + bool need_table_ids, bool need_process_table_ids) + : offset_(0), + offsets_(static_cast(IdShuffleBufferType::kMaxType), -1), + sizes_(static_cast(IdShuffleBufferType::kMaxType)), + ptr_(ptr) { + const int64_t num_table_ids = need_process_table_ids ? num_ids : 0; + const size_t table_ids_bytes = need_table_ids ? num_ids * sizeof(U) : 0; + AllocBuffer(IdShuffleBufferType::kNumPartitionedUnique, parallel_num * sizeof(IDX)); + size_t partitioned_ids_bytes = parallel_num * num_ids * sizeof(K); + AllocBuffer(IdShuffleBufferType::kPartitionedUniqueIds, partitioned_ids_bytes); + AllocBuffer(IdShuffleBufferType::kReceivedIds, partitioned_ids_bytes); + AllocBuffer(IdShuffleBufferType::kTableIds, table_ids_bytes); + size_t partitioned_table_ids_bytes = parallel_num * num_table_ids * sizeof(U); + AllocBuffer(IdShuffleBufferType::kPartitionedUniqueTableIds, partitioned_table_ids_bytes); + AllocBuffer(IdShuffleBufferType::kReceivedTableIds, partitioned_table_ids_bytes); + const size_t hash_table_capacity = parallel_num * num_ids; + AllocBuffer(IdShuffleBufferType::kWorkspace, hash_table_capacity * sizeof(TableEntry)); + } + + template + T* Ptr(IdShuffleBufferType type) { + CHECK(ptr_ != nullptr); + int64_t offset = offsets_.at(static_cast(type)); + CHECK_NE(offset, -1); + return reinterpret_cast(reinterpret_cast(ptr_) + offset); + } + + int64_t Size(IdShuffleBufferType type) { return sizes_.at(static_cast(type)); } + + size_t TotalBufferSize() const { return offset_; } + + private: + void AllocBuffer(IdShuffleBufferType type, size_t size) { + const size_t type_id = static_cast(type); + CHECK_EQ(offsets_.at(type_id), -1); + offsets_.at(type_id) = offset_; + sizes_.at(type_id) = size; + offset_ += GetCudaAlignedSize(size); + } + size_t offset_; + std::vector offsets_; + std::vector sizes_; + void* ptr_; +}; + +template +class DataShuffleKernelState final : public user_op::OpKernelState { + public: + explicit DataShuffleKernelState(user_op::KernelInitContext* ctx) + : device_index_(-1), + stream_name_(EagerNcclCommMgr::kDefaultStreamName), + parallel_desc_(ctx->parallel_desc()) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + if (ctx->op_conf().has_stream_name_hint()) { stream_name_ = ctx->op_conf().stream_name_hint(); } + OF_CUDA_CHECK(hipMallocHost( + reinterpret_cast(&host_num_unique_matrix_), + parallel_desc_.parallel_num() * parallel_desc_.parallel_num() * sizeof(IDX))); + } + ~DataShuffleKernelState() { + CudaCurrentDeviceGuard guard(device_index_); + 
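+    // host_num_unique_matrix_ is pinned host memory from hipMallocHost in the
+    // constructor; it is released here with hipHostFree, with the allocating device
+    // made current by the guard above.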
OF_CUDA_CHECK(hipHostFree(host_num_unique_matrix_)); + } + + ncclComm_t comm() { return GetOrCreate().comm; } + + IDX* HostNumUniqueMatrix() { return host_num_unique_matrix_; } + + private: + struct Comm { + Comm(ncclComm_t comm) : comm(comm) {} + ncclComm_t comm; + }; + + const Comm& GetOrCreate() { + if (!comm_) { Init(); } + return *comm_; + } + + void Init() { + std::set> device_set; + for (int64_t parallel_id = 0; parallel_id < parallel_desc_.parallel_num(); ++parallel_id) { + int64_t machine_id = CHECK_JUST(parallel_desc_.MachineId4ParallelId(parallel_id)); + int64_t device_id = CHECK_JUST(parallel_desc_.DeviceId4ParallelId(parallel_id)); + device_set.emplace(std::make_pair(machine_id, device_id)); + } + EagerNcclCommMgr* comm_mgr = CHECK_NOTNULL(Singleton::Get()); + ncclComm_t comm; + comm = comm_mgr->GetCommForDeviceAndStreamName(device_set, stream_name_); + comm_.reset(new Comm(comm)); + } + + int device_index_; + bool has_independent_stream_; + std::string stream_name_; + ParallelDesc parallel_desc_; + std::unique_ptr comm_; + IDX* host_num_unique_matrix_; +}; + +} // namespace + +template +class IdShuffleKernel final : public user_op::OpKernel { + public: + IdShuffleKernel() = default; + ~IdShuffleKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* kernel_state = dynamic_cast*>(state); + CHECK(kernel_state != nullptr); + const user_op::Tensor* ids = ctx->Tensor4ArgNameAndIndex("ids", 0); + user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); + user_op::Tensor* inverse_unique_partition_indices = + ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); + user_op::Tensor* cur_rank_num_unique = ctx->Tensor4ArgNameAndIndex("cur_rank_num_unique", 0); + user_op::Tensor* cur_rank_unique_ids = ctx->Tensor4ArgNameAndIndex("cur_rank_unique_ids", 0); + user_op::Tensor* cur_rank_unique_table_ids = + ctx->Tensor4ArgNameAndIndex("cur_rank_unique_table_ids", 0); + user_op::Tensor* cur_rank_inverse_indices = + ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int32_t num_tables = ctx->Attr("num_tables"); + const bool has_table_ids = ctx->has_input("table_ids", 0); + const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); + const bool need_process_table_ids = (has_table_ids || num_tables > 1); + const int64_t num_ids = ids->shape_view().elem_cnt(); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + IdShuffleTmpBufferManager buffer_manager( + tmp_buffer->mut_dptr(), num_ids, parallel_num, need_gen_table_ids, need_process_table_ids); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), buffer_manager.TotalBufferSize()); + + const U* table_ids_ptr; + if (has_table_ids) { + const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + table_ids_ptr = reinterpret_cast(table_ids->dptr()); + } else if (need_gen_table_ids) { + hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_ids), kCudaThreadsNumPerBlock, 0, cuda_stream, + num_ids, num_tables, buffer_manager.template 
Ptr(IdShuffleBufferType::kTableIds)); + table_ids_ptr = buffer_manager.template Ptr(IdShuffleBufferType::kTableIds); + } else { + table_ids_ptr = nullptr; + } + IDX* num_partitioned_unique = + buffer_manager.template Ptr(IdShuffleBufferType::kNumPartitionedUnique); + K* partitioned_unique_ids = + buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueIds); + U* partitioned_unique_table_ids = + buffer_manager.template Ptr(IdShuffleBufferType::kPartitionedUniqueTableIds); + IDX* num_unique_matrix_ptr = reinterpret_cast(num_unique_matrix->mut_dptr()); + size_t hash_table_capacity = parallel_num * num_ids; + void* workspace_ptr = buffer_manager.Ptr(IdShuffleBufferType::kWorkspace); + size_t workspace_size = buffer_manager.Size(IdShuffleBufferType::kWorkspace); + UniqueAndPartition( + cuda_stream, num_ids, hash_table_capacity, parallel_num, + reinterpret_cast(ids->dptr()), table_ids_ptr, num_partitioned_unique, + partitioned_unique_ids, partitioned_unique_table_ids, + reinterpret_cast(inverse_unique_partition_indices->mut_dptr()), workspace_ptr, + workspace_size, need_process_table_ids); + ncclComm_t comm = kernel_state->comm(); + OF_NCCL_CHECK(ncclAllGather(num_partitioned_unique, num_unique_matrix_ptr, parallel_num, + GetNcclDataType(num_unique_matrix->data_type()), comm, + cuda_stream)); + IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix_ptr, + parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, + cuda_stream)); + CHECK_JUST(ctx->stream()->Sync()); + + K* received_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedIds); + U* received_table_ids = buffer_manager.template Ptr(IdShuffleBufferType::kReceivedTableIds); + int64_t received_elem_cnt = 0; + ShuffleIdsAndTableIds(cuda_stream, comm, parallel_id, parallel_num, num_ids, ids->data_type(), + cur_rank_unique_table_ids->data_type(), host_num_unique_matrix, + partitioned_unique_ids, partitioned_unique_table_ids, received_ids, + received_table_ids, &received_elem_cnt, need_process_table_ids); + UniqueAndPartition( + cuda_stream, received_elem_cnt, hash_table_capacity, 1, received_ids, received_table_ids, + reinterpret_cast(cur_rank_num_unique->mut_dptr()), + reinterpret_cast(cur_rank_unique_ids->mut_dptr()), + reinterpret_cast(cur_rank_unique_table_ids->mut_dptr()), + reinterpret_cast(cur_rank_inverse_indices->mut_dptr()), workspace_ptr, workspace_size, + need_process_table_ids); + if (!need_process_table_ids) { + OF_CUDA_CHECK(hipMemsetAsync(cur_rank_unique_table_ids->mut_dptr(), 0, + received_elem_cnt * sizeof(U), cuda_stream)); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define ID_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) + +#define TABLE_ID_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ + OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) + +#define IDX_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define REGISTER_CUDA_ID_SHUFFLE_KERNEL(k_dtype_pair, table_id_dtype_pair, idx_dtype_pair) \ + 
REGISTER_USER_KERNEL("id_shuffle") \ + .SetCreateFn< \ + IdShuffleKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("ids", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ + && (user_op::HobDataType("cur_rank_unique_table_ids", 0) \ + == OF_PP_PAIR_SECOND(table_id_dtype_pair)) \ + && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& ids = ctx->InputTensorDesc("ids", 0); \ + const bool has_table_ids = ctx->has_input("table_ids", 0); \ + const int32_t num_tables = ctx->Attr("num_tables"); \ + const bool need_gen_table_ids = (!has_table_ids && num_tables > 1); \ + const bool need_process_table_ids = (has_table_ids || num_tables > 1); \ + IdShuffleTmpBufferManager \ + buffer_manager(nullptr, ids.shape().elem_cnt(), ctx->parallel_desc().parallel_num(), \ + need_gen_table_ids, need_process_table_ids); \ + return buffer_manager.TotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ID_SHUFFLE_KERNEL, ID_DATA_TYPE_SEQ, + TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + T* reverse_unique_cur_rank_embeddings, T* received_embeddings) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, + reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); +} + +// Quantized Version. +template +void ShuffleEmbeddings(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + int8_t* reverse_unique_cur_rank_embeddings, int8_t* received_embeddings, + T* reverse_cur_rank_quantize_factor, T* recv_quantize_factor) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + // shuffle quantized_embedding + MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); + ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, + reverse_unique_cur_rank_embeddings, recv_offsets, recv_elem_cnt, received_embeddings); + // shuffle quantize_factor + MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, + parallel_num, &recv_offsets, &recv_elem_cnt, &send_offsets, &send_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, + reverse_cur_rank_quantize_factor, recv_offsets, recv_elem_cnt, recv_quantize_factor); +} + +__device__ float RoundHalfAwayFromZero(const float x) { + float abs_val = abs(x); + float floor_val = floor(abs_val + static_cast(0.5)); + return copysignf(floor_val, x); +} + +// warp reduce version. 
+constexpr int32_t kWarpSize = 32; +constexpr int32_t kMaxColSize = 1024; + +template +__inline__ __device__ T WarpMaxAllReduce(T val) { + for (int32_t lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) { + // val = max(val, __shfl_xor_sync(0xffffffff, val, lane_mask, thread_group_width)); + val = max(val, __shfl_xor(val, lane_mask, thread_group_width)); + } + return val; +} + +inline hipError_t GetWarpImplNumBlocks(int64_t block_size, int64_t max_blocks, int64_t waves, + int* num_blocks) { + int dev; + { + hipError_t err = hipGetDevice(&dev); + if (err != hipSuccess) { return err; } + } + int sm_count; + { + hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev); + if (err != hipSuccess) { return err; } + } + int tpm; + { + hipError_t err = hipDeviceGetAttribute(&tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev); + if (err != hipSuccess) { return err; } + } + *num_blocks = + std::max(1, std::min(max_blocks, sm_count * tpm / block_size * waves)); + return hipSuccess; +} + +template +__global__ void QuantizeWarpImplKernel(const T* src, int8_t* dst, T* quantize_factor, + const int64_t rows, const int64_t cols) { + static_assert(cols_per_thread % pack_size == 0, ""); + static_assert(thread_group_width <= kWarpSize, ""); + static_assert(kWarpSize % thread_group_width == 0, ""); + constexpr int num_packs = cols_per_thread / pack_size; + assert(cols <= cols_per_thread * thread_group_width); + ComputeType buf[rows_per_access][cols_per_thread]; + const int global_thread_group_id = blockIdx.x * blockDim.y + threadIdx.y; + const int num_global_thread_group = gridDim.x * blockDim.y; + const int lane_id = threadIdx.x; + const int64_t step = num_global_thread_group * rows_per_access; + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StoreType = cuda::elementwise::PackType; + using StorePack = cuda::elementwise::Pack; + + for (int64_t row = global_thread_group_id * rows_per_access; row < rows; row += step) { + ComputeType thread_abs_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; row_id++) { + ComputeType* row_buf = buf[row_id]; + thread_abs_max[row_id] = 0.0; +#pragma unroll + for (int pack_id = 0; pack_id < num_packs; pack_id++) { + const int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + LoadPack load_pack; + if (!padding || col < cols) { + const int64_t load_offset = ((row + row_id) * cols + col) / pack_size; + load_pack.storage = *(reinterpret_cast(src) + load_offset); +#pragma unroll + for (int i = 0; i < pack_size; i++) { + row_buf[pack_offset + i] = static_cast(load_pack.elem[i]); + thread_abs_max[row_id] = max(thread_abs_max[row_id], abs(row_buf[pack_offset + i])); + } + } else { +#pragma unroll + for (int i = 0; i < pack_size; i++) { row_buf[pack_offset + i] = 0.0; } + } + } + } + ComputeType warp_max[rows_per_access]; +#pragma unroll + for (int row_id = 0; row_id < rows_per_access; row_id++) { + warp_max[row_id] = WarpMaxAllReduce(thread_abs_max[row_id]); + if (threadIdx.x == 0) { quantize_factor[row + row_id] = static_cast(warp_max[row_id]); } + ComputeType* row_buf = buf[row_id]; + ComputeType quantize_factor_val = static_cast(127.0) / warp_max[row_id]; +#pragma unroll + for (int col = 0; col < cols_per_thread; col++) { + row_buf[col] = RoundHalfAwayFromZero(row_buf[col] * quantize_factor_val); + } +#pragma unroll + for (int pack_id = 0; pack_id < num_packs; pack_id++) { + const 
int pack_offset = pack_id * pack_size; + const int col = (pack_id * thread_group_width + lane_id) * pack_size; + StorePack store_pack; + if (!padding || col < cols) { + const int64_t store_offset = ((row + row_id) * cols + col) / pack_size; + for (int i = 0; i < pack_size; i++) { + store_pack.elem[i] = static_cast(row_buf[pack_id * pack_size + i]); + } + *(reinterpret_cast(dst) + store_offset) = store_pack.storage; + } + } + } + } +} + +template +inline hipError_t LaunchQuantizeWarpImpl(hipStream_t stream, const T* src, int8_t* dst, + T* quantize_factor, const int64_t rows, + const int64_t cols) { + constexpr int block_size = 128; + constexpr int waves = 32; + static_assert(block_size % thread_group_width == 0, ""); + constexpr int thread_groups_per_block = block_size / thread_group_width; + dim3 block_dim(thread_group_width, thread_groups_per_block); + const int64_t num_blocks = + (rows / rows_per_access + thread_groups_per_block - 1) / thread_groups_per_block; + int grid_dim_x = 0; + + hipError_t err = GetWarpImplNumBlocks(block_size, num_blocks, waves, &grid_dim_x); + if (err != hipSuccess) { return err; } + + QuantizeWarpImplKernel + <<>>(src, dst, quantize_factor, rows, cols); + return hipPeekAtLastError(); +} + +template +inline hipError_t DispatchQuantizeWarpImplPadding(hipStream_t stream, const T* src, int8_t* dst, + T* quantize_factor, const int64_t rows, + const int64_t cols) { + if (cols == cols_per_thread * thread_group_width) { + return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, + cols); + } else { + return LaunchQuantizeWarpImpl(stream, src, dst, quantize_factor, rows, + cols); + } +} + +template +typename std::enable_if::type DispatchQuantizeWarpImplCols( + hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } else { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchQuantizeWarpImplPadding( \ + stream, src, dst, quantize_factor, rows, cols); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(5) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(7) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(9) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(11) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(13) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(15) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(17) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(19) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(21) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(23) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(25) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(27) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(29) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(31) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +typename std::enable_if::type DispatchQuantizeWarpImplCols( + hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, const int64_t rows, + const int64_t cols) { + if (cols <= 0) { return hipErrorInvalidValue; } +#define 
DEFINE_ONE_ELIF(thread_group_width) \ + else if (cols <= (thread_group_width)*pack_size) { \ + if (rows % 2 == 0) { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } else { \ + return DispatchQuantizeWarpImplPadding(stream, src, dst, \ + quantize_factor, rows, cols); \ + } \ + } + DEFINE_ONE_ELIF(1) + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF +#define DEFINE_ONE_ELIF(col) \ + else if (cols <= (col)*kWarpSize) { \ + return DispatchQuantizeWarpImplPadding( \ + stream, src, dst, quantize_factor, rows, cols); \ + } + DEFINE_ONE_ELIF(4) + DEFINE_ONE_ELIF(6) + DEFINE_ONE_ELIF(8) + DEFINE_ONE_ELIF(10) + DEFINE_ONE_ELIF(12) + DEFINE_ONE_ELIF(14) + DEFINE_ONE_ELIF(16) + DEFINE_ONE_ELIF(18) + DEFINE_ONE_ELIF(20) + DEFINE_ONE_ELIF(22) + DEFINE_ONE_ELIF(24) + DEFINE_ONE_ELIF(26) + DEFINE_ONE_ELIF(28) + DEFINE_ONE_ELIF(30) + DEFINE_ONE_ELIF(32) +#undef DEFINE_ONE_ELIF + else { + return hipErrorInvalidValue; + } +} + +template +struct DispatchQuantizeWarpImplPackSize { + hipError_t operator()(hipStream_t stream, const T* src, int8_t* dst, T* quantize_factor, + const int64_t rows, const int64_t cols) { + if (cols % 2 == 0) { + return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, + rows, cols); + } else { + return DispatchQuantizeWarpImplCols(stream, src, dst, quantize_factor, + rows, cols); + } + } +}; + +template +__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, + IDX elem_cnt); + +template +__global__ void DequantizeKernel(const int8_t* x, T* quantize_factor, T* out, IDX col_size, + IDX elem_cnt) { + IDX global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + for (int index = global_thread_id * pack_size; index < elem_cnt; + index += gridDim.x * blockDim.x * pack_size) { + IDX quantize_factor_idx = index / col_size; + ComputeType quantize_factor_val = static_cast(quantize_factor[quantize_factor_idx]) + / static_cast(127.0); + using LoadPackType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StorePackType = cuda::elementwise::PackType; + using StorePack = cuda::elementwise::Pack; + LoadPack load_pack{}; + StorePack store_pack{}; + load_pack.storage = *(reinterpret_cast(x) + index / pack_size); +#pragma unroll + for (int i = 0; i < pack_size; i++) { + store_pack.elem[i] = + static_cast(static_cast(load_pack.elem[i]) * quantize_factor_val); + } + *(reinterpret_cast(out) + index / pack_size) = store_pack.storage; + } +} + +template +hipError_t DispatchDequantizeKernelPackSize(hipStream_t stream, const int8_t* src, + T* quantize_factor, T* dst, const int64_t col_size, + const int64_t elem_cnt) { + const int64_t pack_num = elem_cnt / pack_size; + int grid_size = 0; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (err != hipSuccess) { return err; } + hipLaunchKernelGGL(HIP_KERNEL_NAME(DequantizeKernel), grid_size, cuda::elementwise::kBlockSize, 0, stream, src, quantize_factor, dst, col_size, + elem_cnt); + return hipSuccess; +} + +template +inline hipError_t LaunchDequantizeKernel(hipStream_t stream, const int8_t* src, + T* quantize_factor, T* dst, const int64_t col_size, + const int64_t elem_cnt) { + constexpr int quantized_src_pack_size = cuda::elementwise::PackSize(); + constexpr int dst_pack_size = cuda::elementwise::PackSize(); + int launch_pack_size = std::min(quantized_src_pack_size, dst_pack_size); + if (launch_pack_size == 8 && 
col_size % 8 == 0) { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } else if (launch_pack_size == 4 && col_size % 4 == 0) { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } else if (launch_pack_size == 2 && col_size % 2 == 0) { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } else { + hipError_t err = DispatchDequantizeKernelPackSize( + stream, src, quantize_factor, dst, col_size, elem_cnt); + if (err != hipSuccess) { return err; } + } + return hipPeekAtLastError(); +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +template +class EmbeddingShuffleKernel final : public user_op::OpKernel { + public: + EmbeddingShuffleKernel() = default; + ~EmbeddingShuffleKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* kernel_state = dynamic_cast*>(state); + CHECK(kernel_state != nullptr); + const user_op::Tensor* cur_rank_embeddings = + ctx->Tensor4ArgNameAndIndex("cur_rank_embeddings", 0); + const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); + const user_op::Tensor* cur_rank_inverse_indices = + ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); + const user_op::Tensor* inverse_unique_partition_indices = + ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); + user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + ncclComm_t comm = kernel_state->comm(); + using ComputeType = typename DefaultComputeType::type; + const int64_t embedding_size = cur_rank_embeddings->shape_view().At(1); + IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); + DataType data_type = cur_rank_embeddings->data_type(); + const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + bool enable_quantized_comm_env_var = + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); + bool enable_quantized_comm = enable_quantized_comm_env_var && (embedding_size < kMaxColSize); + if (enable_quantized_comm_env_var && !enable_quantized_comm) { + LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " + "embedding_size less equal than 1024 can use quantized communication. 
"; + } + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + OF_CUDA_CHECK(hipMemcpyAsync( + host_num_unique_matrix, reinterpret_cast(num_unique_matrix->dptr()), + parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, cuda_stream)); + CHECK_JUST(ctx->stream()->Sync()); + int64_t cur_rank_num_ids = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; + } + size_t full_elem_cnt = parallel_num * num_ids * embedding_size; + CHECK_EQ(full_elem_cnt, cur_rank_embeddings->shape_view().elem_cnt()); + if (!enable_quantized_comm) { + size_t reverse_unique_cur_rank_embeddings_size = + GetCudaAlignedSize(full_elem_cnt * sizeof(T)); + size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; + + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + reverse_unique_cur_rank_embeddings_size + received_embeddings_size); + + T* reverse_unique_cur_rank_embeddings = reinterpret_cast(tmp_buffer->mut_dptr()); + T* received_embeddings = reinterpret_cast(tmp_buffer->mut_dptr() + + reverse_unique_cur_rank_embeddings_size); + // reverse cur_rank unique + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_num_ids, cur_rank_embeddings->dptr(), + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), + reverse_unique_cur_rank_embeddings, 0); + + ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, + data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, + received_embeddings); + + // reverse unique_partition + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, + Shape({1, parallel_num * num_ids, embedding_size}), embeddings->mut_dptr(), 0); + } else { + size_t reverse_unique_cur_rank_embeddings_size = + GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); + size_t received_embeddings_size = reverse_unique_cur_rank_embeddings_size; + size_t quantize_cur_rank_embeddings_size = reverse_unique_cur_rank_embeddings_size; + size_t reverse_recv_quantize_cur_rank_embeddings_size = + reverse_unique_cur_rank_embeddings_size; + size_t cur_rank_quantize_factor_size = + GetCudaAlignedSize(cur_rank_embeddings->shape_view().At(0) * sizeof(T)); + size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; + size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; + size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + reverse_unique_cur_rank_embeddings_size + received_embeddings_size + + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size + + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size + + reverse_recv_quantize_factor_size); + int8_t* reverse_unique_cur_rank_embeddings = + reinterpret_cast(tmp_buffer->mut_dptr()); + int8_t* received_embeddings = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size); + int8_t* quantize_cur_rank_embeddings = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size); + int8_t* reverse_recv_quantize_cur_rank_embeddings = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + 
quantize_cur_rank_embeddings_size); + T* cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size); + T* reverse_cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size); + T* recv_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size + + reverse_cur_rank_quantize_factor_size); + T* reverse_recv_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + reverse_unique_cur_rank_embeddings_size + + received_embeddings_size + quantize_cur_rank_embeddings_size + + reverse_recv_quantize_cur_rank_embeddings_size + cur_rank_quantize_factor_size + + reverse_cur_rank_quantize_factor_size + recv_quantize_factor_size); + DispatchQuantizeWarpImplPackSize()( + cuda_stream, cur_rank_embeddings->dptr(), quantize_cur_rank_embeddings, + cur_rank_quantize_factor, cur_rank_num_ids, embedding_size); + // reverse cur_rank embedding unique + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_num_ids, quantize_cur_rank_embeddings, + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, embedding_size}), + reverse_unique_cur_rank_embeddings, 0); + + // reverse cur_rank quantize factor unique + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_num_ids, cur_rank_quantize_factor, + Shape({1, cur_rank_embeddings->shape_view().elem_cnt() / embedding_size, 1}), + reverse_cur_rank_quantize_factor, 0); + + ShuffleEmbeddings(cuda_stream, comm, parallel_id, parallel_num, num_ids, embedding_size, + data_type, host_num_unique_matrix, reverse_unique_cur_rank_embeddings, + received_embeddings, reverse_cur_rank_quantize_factor, + recv_quantize_factor); + + // reverse unique_partition + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + inverse_unique_partition_indices->shape_view().elem_cnt(), received_embeddings, + Shape({1, parallel_num * num_ids, embedding_size}), + reverse_recv_quantize_cur_rank_embeddings, 0); + + GatherKernelUtilImpl::Forward( + ctx->stream(), reinterpret_cast(inverse_unique_partition_indices->dptr()), + inverse_unique_partition_indices->shape_view().elem_cnt(), recv_quantize_factor, + Shape({1, parallel_num * num_ids, 1}), reverse_recv_quantize_factor, 0); + + int32_t dequantize_row_size = inverse_unique_partition_indices->shape_view().elem_cnt(); + IDX dequantize_elem_cnt = dequantize_row_size * embedding_size; + OF_CUDA_CHECK((LaunchDequantizeKernel( + cuda_stream, reverse_recv_quantize_cur_rank_embeddings, reverse_recv_quantize_factor, + embeddings->mut_dptr(), embedding_size, dequantize_elem_cnt))); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_shuffle") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && 
(user_op::HobDataType("cur_rank_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ + && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& cur_rank_embeddings = \ + ctx->InputTensorDesc("cur_rank_embeddings", 0); \ + bool enable_quantized_comm = \ + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ + && (cur_rank_embeddings.shape().At(1) < kMaxColSize); \ + size_t tmp_size = 0; \ + if (!enable_quantized_comm) { \ + size_t reverse_cur_rank_embeddings_size = GetCudaAlignedSize( \ + cur_rank_embeddings.shape().elem_cnt() * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t recv_unique_embeddings_size = reverse_cur_rank_embeddings_size; \ + tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings_size; \ + } else { \ + size_t total_elem_cnt = cur_rank_embeddings.shape().elem_cnt(); \ + size_t reverse_cur_rank_embeddings_size = \ + GetCudaAlignedSize(total_elem_cnt * sizeof(int8_t)); \ + size_t recv_unique_embeddings = reverse_cur_rank_embeddings_size; \ + size_t quantize_cur_rank_embeddings_size = reverse_cur_rank_embeddings_size; \ + size_t reverse_recv_quantize_cur_rank_embeddings_size = \ + reverse_cur_rank_embeddings_size; \ + size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ + cur_rank_embeddings.shape().At(0) * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t reverse_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ + size_t recv_quantize_factor_size = cur_rank_quantize_factor_size; \ + size_t reverse_recv_quantize_factor_size = cur_rank_quantize_factor_size; \ + tmp_size = reverse_cur_rank_embeddings_size + recv_unique_embeddings \ + + quantize_cur_rank_embeddings_size \ + + reverse_recv_quantize_cur_rank_embeddings_size \ + + cur_rank_quantize_factor_size + reverse_cur_rank_quantize_factor_size \ + + recv_quantize_factor_size + reverse_recv_quantize_factor_size; \ + } \ + return tmp_size; \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_SHUFFLE_KERNEL, + FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + + +template +void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + T* unique_partition_embedding_grad, T* received_embeddings_grad) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, + unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, + received_embeddings_grad); +} + +// Quantize Version. +template +void ShuffleEmbeddingsGrad(hipStream_t cuda_stream, ncclComm_t comm, int64_t parallel_id, + int64_t parallel_num, int64_t num_ids, int64_t embedding_size, + DataType data_type, IDX* host_num_unique_matrix, + int8_t* unique_partition_embedding_grad, + int8_t* received_embeddings_grad, T* cur_rank_quantize_factor, + T* received_cur_rank_quantize_factor) { + std::vector send_offsets; + std::vector send_elem_cnt; + std::vector recv_offsets; + std::vector recv_elem_cnt; + // Shuffle Embedding Grad. 
+ MakeShuffleParams(host_num_unique_matrix, num_ids, embedding_size, parallel_id, parallel_num, + &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, DataType::kInt8, send_offsets, send_elem_cnt, + unique_partition_embedding_grad, recv_offsets, recv_elem_cnt, + received_embeddings_grad); + // Shuffle Quantize factor. + MakeShuffleParams(host_num_unique_matrix, num_ids, /*embedding_size=*/1, parallel_id, + parallel_num, &send_offsets, &send_elem_cnt, &recv_offsets, &recv_elem_cnt); + ShuffleData(cuda_stream, comm, data_type, send_offsets, send_elem_cnt, cur_rank_quantize_factor, + recv_offsets, recv_elem_cnt, received_cur_rank_quantize_factor); +} + +template +__global__ void UnsortedSegmentHalfGpu(const IDX in_h2_elem_cnt, const IDX h2_inner_dim_size, + const IDX inner_dim_size, const half* data, + const K* segment_ids, const IDX num_segments, + half2* out_h2) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, in_h2_elem_cnt) { + const IDX segment_id_idx = i / h2_inner_dim_size; + const IDX h2_inner_idx = i - segment_id_idx * h2_inner_dim_size; + const IDX inner_idx_0 = 2 * h2_inner_idx; + const IDX inner_idx_1 = inner_idx_0 + 1; + const half* data_row = data + segment_id_idx * inner_dim_size; + half2 val; + val.data.x = data_row[inner_idx_0]; + val.data.y = (inner_idx_1 >= inner_dim_size) ? static_cast(0) : data_row[inner_idx_1]; + const IDX idx = segment_ids[segment_id_idx]; + const IDX out_h2_offset = idx * h2_inner_dim_size + h2_inner_idx; + cuda::atomic::Add(out_h2 + out_h2_offset, val); + } +} + +template +struct UnsortedSegmentSumPad { + void operator()(ep::Stream* stream, const K* segment_ids, const T* data, int64_t num_segment_ids, + int64_t num_segments, int64_t inner_dim_size, int64_t padded_inner_dim_size, + T* out) const { + UNIMPLEMENTED(); + } +}; + +template +struct UnsortedSegmentSumPad { + void operator()(ep::Stream* stream, const K* segment_ids, const half* data, + int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, + int64_t padded_inner_dim_size, half* out) const { + const int64_t data_elem_cnt = num_segment_ids * inner_dim_size; + const int64_t out_elem_cnt = num_segments * padded_inner_dim_size; + CHECK_EQ(padded_inner_dim_size % 2, 0); + CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); + const int64_t h2_inner_dim_size = padded_inner_dim_size / 2; + const int64_t in_h2_elem_cnt = num_segment_ids * h2_inner_dim_size; + if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { + UnsortedSegmentHalfGpu + <<As()->cuda_stream()>>>( + in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, + reinterpret_cast(out)); + } else { + UnsortedSegmentHalfGpu + <<As()->cuda_stream()>>>( + in_h2_elem_cnt, h2_inner_dim_size, inner_dim_size, data, segment_ids, num_segments, + reinterpret_cast(out)); + } + } +}; + +template +void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const T* data, + int64_t num_segment_ids, int64_t num_segments, int64_t inner_dim_size, + int64_t padded_inner_dim_size, T* out) { + if (inner_dim_size == padded_inner_dim_size) { + UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( + stream, segment_ids, data, num_segment_ids, num_segments, 1, inner_dim_size, 0, out); + } else { + CHECK_EQ(inner_dim_size + 1, padded_inner_dim_size); + UnsortedSegmentSumPad()(stream, segment_ids, data, num_segment_ids, num_segments, + inner_dim_size, padded_inner_dim_size, out); + } +} + +template +void UniquePartitionEmbeddingGrad(ep::Stream* stream, int64_t parallel_id, int64_t 
parallel_num, + int64_t num_ids, int64_t embedding_size, + int64_t padded_embedding_size, const IDX* host_num_unique_matrix, + const T* embedding_grad, + const IDX* inverse_unique_partition_indices, + T* unique_partition_embedding_grad) { + for (int64_t i = 0; i < parallel_num; ++i) { + const int64_t offset = i * num_ids * padded_embedding_size; + const int64_t valid_value_size = + host_num_unique_matrix[parallel_id * parallel_num + i] * padded_embedding_size * sizeof(T); + OF_CUDA_CHECK(hipMemsetAsync(unique_partition_embedding_grad + offset, 0, valid_value_size, + stream->As()->cuda_stream())); + } + UnsortedSegmentSum(stream, inverse_unique_partition_indices, embedding_grad, num_ids, + parallel_num * num_ids, embedding_size, padded_embedding_size, + unique_partition_embedding_grad); +} + +template +void UniqueCurRankEmbeddingGrad(ep::Stream* stream, DataType data_type, int64_t cur_rank_num_ids, + int64_t embedding_size, int64_t padded_embedding_size, + const T* cur_rank_embedding_grad, + const IDX* cur_rank_inverse_indices, + T* cur_rank_unique_embedding_grad, T* tmp_buffer) { + T* unsorted_segment_sum_out = + (embedding_size == padded_embedding_size) ? cur_rank_unique_embedding_grad : tmp_buffer; + OF_CUDA_CHECK(hipMemsetAsync(unsorted_segment_sum_out, 0, + cur_rank_num_ids * padded_embedding_size * sizeof(T), + stream->As()->cuda_stream())); + UnsortedSegmentSum(stream, cur_rank_inverse_indices, cur_rank_embedding_grad, + cur_rank_num_ids, cur_rank_num_ids, padded_embedding_size, + padded_embedding_size, unsorted_segment_sum_out); + if (embedding_size != padded_embedding_size) { + std::unique_ptr primitive = + ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); + DimVector dst_shape = {cur_rank_num_ids, embedding_size}; + DimVector dst_pos_vec = {0, 0}; + DimVector src_shape = {cur_rank_num_ids, padded_embedding_size}; + DimVector src_pos_vec = {0, 0}; + DimVector extent_vec = {cur_rank_num_ids, embedding_size}; + primitive->Launch(stream, data_type, 2, cur_rank_unique_embedding_grad, dst_shape.data(), + dst_pos_vec.data(), unsorted_segment_sum_out, src_shape.data(), + src_pos_vec.data(), extent_vec.data()); + } +} + +int64_t GetPaddedEmbeddingSize(DataType data_type, int64_t embedding_size) { + if (data_type == DataType::kFloat16 && embedding_size % 2 != 0) { + return embedding_size + 1; + } else { + return embedding_size; + } +} + +template +class EmbeddingGradientShuffleKernel final : public user_op::OpKernel { + public: + EmbeddingGradientShuffleKernel() = default; + ~EmbeddingGradientShuffleKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* kernel_state = dynamic_cast*>(state); + CHECK(kernel_state != nullptr); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + + const user_op::Tensor* num_unique_matrix = ctx->Tensor4ArgNameAndIndex("num_unique_matrix", 0); + const user_op::Tensor* cur_rank_inverse_indices = + ctx->Tensor4ArgNameAndIndex("cur_rank_inverse_indices", 0); + const user_op::Tensor* inverse_unique_partition_indices = + ctx->Tensor4ArgNameAndIndex("inverse_unique_partition_indices", 0); + user_op::Tensor* cur_rank_unique_embedding_grad = + ctx->Tensor4ArgNameAndIndex("cur_rank_unique_embedding_grad", 0); + const int64_t embedding_size = 
cur_rank_unique_embedding_grad->shape_view().At(1); + IDX* host_num_unique_matrix = kernel_state->HostNumUniqueMatrix(); + DataType data_type = embedding_grad->data_type(); + const int64_t num_ids = inverse_unique_partition_indices->shape_view().elem_cnt(); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + const int64_t padded_embedding_size = GetPaddedEmbeddingSize(data_type, embedding_size); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + ncclComm_t comm = kernel_state->comm(); + using ComputeType = typename DefaultComputeType::type; + bool enable_quantized_comm_env_var = + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false); + bool enable_quantized_comm = + enable_quantized_comm_env_var && (padded_embedding_size < kMaxColSize); + if (enable_quantized_comm_env_var && !enable_quantized_comm) { + LOG(WARNING) << "Only envrionment variable ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM=1 and " + "embedding_size less equal than 1024 can use quantized communication. "; + } + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_unique_matrix, num_unique_matrix->dptr(), + parallel_num * parallel_num * sizeof(IDX), hipMemcpyDefault, + cuda_stream)); + CHECK_JUST(ctx->stream()->Sync()); + + int64_t cur_rank_num_ids = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + cur_rank_num_ids += host_num_unique_matrix[i * parallel_num + parallel_id]; + } + size_t full_num_ids = parallel_num * num_ids; + size_t full_elem_cnt = full_num_ids * padded_embedding_size; + size_t unique_partition_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(T)); + + if (!enable_quantized_comm) { + size_t received_embedding_grad_size = unique_partition_embedding_grad_size; + T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); + T* received_embedding_grad = + reinterpret_cast(tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + unique_partition_embedding_grad_size + received_embedding_grad_size); + + UniquePartitionEmbeddingGrad( + ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, + host_num_unique_matrix, embedding_grad->dptr(), + reinterpret_cast(inverse_unique_partition_indices->dptr()), + unique_partition_embedding_grad); + + ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, + padded_embedding_size, data_type, host_num_unique_matrix, + unique_partition_embedding_grad, received_embedding_grad); + + // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. 
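+      // The reuse below is a space optimization: after ShuffleEmbeddingsGrad,
+      // the partitioned gradients in unique_partition_embedding_grad are no
+      // longer read, so the same memory can serve as the scratch buffer of
+      // UniqueCurRankEmbeddingGrad, which (as defined above) only touches it
+      // when embedding_size != padded_embedding_size, first segment-summing in
+      // the padded layout and then copying the unpadded columns out via CopyNd.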
+ T* buffer_ptr = unique_partition_embedding_grad; + UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, + padded_embedding_size, received_embedding_grad, + reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); + } else { + size_t received_embedding_grad_size = GetCudaAlignedSize(full_elem_cnt * sizeof(int8_t)); + size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; + size_t cur_rank_quantize_factor_size = GetCudaAlignedSize(full_num_ids * sizeof(T)); + size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; + size_t dequantize_cur_rank_embedding_grad_size = + GetCudaAlignedSize(full_elem_cnt * sizeof(T)); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + unique_partition_embedding_grad_size + received_embedding_grad_size + + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size + + received_cur_rank_quantize_factor_size + + dequantize_cur_rank_embedding_grad_size); + T* unique_partition_embedding_grad = reinterpret_cast(tmp_buffer->mut_dptr()); + int8_t* received_embedding_grad = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size); + + int8_t* quantize_cur_rank_embedding_grad = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size); + T* cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size); + T* received_cur_rank_quantize_factor = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size + + cur_rank_quantize_factor_size); + T* dequantize_cur_rank_embedding_grad = reinterpret_cast( + tmp_buffer->mut_dptr() + unique_partition_embedding_grad_size + + received_embedding_grad_size + quantize_cur_rank_embedding_grad_size + + cur_rank_quantize_factor_size + received_cur_rank_quantize_factor_size); + + UniquePartitionEmbeddingGrad( + ctx->stream(), parallel_id, parallel_num, num_ids, embedding_size, padded_embedding_size, + host_num_unique_matrix, embedding_grad->dptr(), + reinterpret_cast(inverse_unique_partition_indices->dptr()), + unique_partition_embedding_grad); + + // Quantize. + for (int64_t i = 0; i < parallel_num; ++i) { + const int64_t embedding_grad_offset = i * num_ids * padded_embedding_size; + const int64_t quantize_factor_offset = i * num_ids; + const int64_t valid_row_size = host_num_unique_matrix[parallel_id * parallel_num + i]; + DispatchQuantizeWarpImplPackSize()( + cuda_stream, unique_partition_embedding_grad + embedding_grad_offset, + quantize_cur_rank_embedding_grad + embedding_grad_offset, + cur_rank_quantize_factor + quantize_factor_offset, valid_row_size, + padded_embedding_size); + } + + ShuffleEmbeddingsGrad(cuda_stream, comm, parallel_id, parallel_num, num_ids, + padded_embedding_size, data_type, host_num_unique_matrix, + quantize_cur_rank_embedding_grad, received_embedding_grad, + cur_rank_quantize_factor, received_cur_rank_quantize_factor); + + int64_t dequantize_cur_rank_num = 0; + for (int64_t i = 0; i < parallel_num; ++i) { + /* + Host num unique matrix: + | Partition0 | Partition1 | + | Rank0 | 2 | 4 | + | Rank1 | 3 | 3 | + After ShuffleEmbeddingGrads, each rank will exchange partition. + For example: + Rank0 will have (matrix[rank0][part0] + matrix[rank1][part0]) grad tensor. 
+ Rank1 will have (matrix[rank0][part1] + matrix[rank1][part1]) grad tensor. + */ + dequantize_cur_rank_num += host_num_unique_matrix[i * parallel_num + parallel_id]; + } + IDX dequantize_elem_cnt = dequantize_cur_rank_num * padded_embedding_size; + OF_CUDA_CHECK((LaunchDequantizeKernel( + cuda_stream, received_embedding_grad, received_cur_rank_quantize_factor, + dequantize_cur_rank_embedding_grad, padded_embedding_size, dequantize_elem_cnt))); + // use unique_partition_embedding_grad as UniqueCurRankEmbeddingGrad buffer. + T* buffer_ptr = unique_partition_embedding_grad; + UniqueCurRankEmbeddingGrad(ctx->stream(), data_type, cur_rank_num_ids, embedding_size, + padded_embedding_size, dequantize_cur_rank_embedding_grad, + reinterpret_cast(cur_rank_inverse_indices->dptr()), + cur_rank_unique_embedding_grad->mut_dptr(), buffer_ptr); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL(t_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_gradient_shuffle") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ + && (user_op::HobDataType("num_unique_matrix", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& cur_rank_unique_embedding_grad = \ + ctx->InputTensorDesc("cur_rank_unique_embedding_grad", 0); \ + size_t cur_rank_embedding_grad_num = cur_rank_unique_embedding_grad.shape().At(0); \ + size_t embedding_size = cur_rank_unique_embedding_grad.shape().At(1); \ + size_t padded_embedding_size = \ + GetPaddedEmbeddingSize(cur_rank_unique_embedding_grad.data_type(), embedding_size); \ + size_t cur_rank_embedding_grad_elem_cnt = \ + cur_rank_embedding_grad_num * padded_embedding_size; \ + bool enable_quantized_comm = \ + ParseBooleanFromEnv("ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM", false) \ + && (padded_embedding_size < kMaxColSize); \ + size_t tmp_size = 0; \ + if (!enable_quantized_comm) { \ + size_t cur_rank_embedding_grad_size = GetCudaAlignedSize( \ + cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + tmp_size = 2 * cur_rank_embedding_grad_size; \ + } else { \ + size_t unique_partition_embedding_grad_size = GetCudaAlignedSize( \ + cur_rank_embedding_grad_elem_cnt * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t received_embedding_grad_size = \ + GetCudaAlignedSize(cur_rank_embedding_grad_elem_cnt * sizeof(int8_t)); \ + size_t quantize_cur_rank_embedding_grad_size = received_embedding_grad_size; \ + size_t cur_rank_quantize_factor_size = GetCudaAlignedSize( \ + cur_rank_embedding_grad_num * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair))); \ + size_t received_cur_rank_quantize_factor_size = cur_rank_quantize_factor_size; \ + size_t dequantize_cur_rank_embedding_grad_size = unique_partition_embedding_grad_size; \ + tmp_size = unique_partition_embedding_grad_size + received_embedding_grad_size \ + + quantize_cur_rank_embedding_grad_size + cur_rank_quantize_factor_size \ + + received_cur_rank_quantize_factor_size \ + + dequantize_cur_rank_embedding_grad_size; \ + } \ + return tmp_size; \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_GRADIENT_SHUFFLE_KERNEL, + FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + // FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class UniqueKeyValuePairKernel final : public user_op::OpKernel 
{ + public: + UniqueKeyValuePairKernel() = default; + ~UniqueKeyValuePairKernel() override = default; + + private: + using user_op::OpKernel::Compute; + + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* keys = ctx->Tensor4ArgNameAndIndex("keys", 0); + user_op::Tensor* num_unique = ctx->Tensor4ArgNameAndIndex("num_unique", 0); + user_op::Tensor* unique_keys = ctx->Tensor4ArgNameAndIndex("unique_keys", 0); + user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); + user_op::Tensor* inverse_indices = ctx->Tensor4ArgNameAndIndex("inverse_indices", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int32_t num_tables = ctx->Attr("num_tables"); + const bool has_values = ctx->has_input("values", 0); + const bool need_values_buffer = (!has_values && num_tables > 1); + size_t values_buffer_bytes = + need_values_buffer ? GetCudaAlignedSize(keys->shape_view().elem_cnt() * sizeof(V)) : 0; + const int64_t num_keys = keys->shape_view().elem_cnt(); + const int64_t hash_capacity = num_keys; + const size_t workspace_bytes = GetCudaAlignedSize(hash_capacity * sizeof(TableEntry)); + CHECK_LE(values_buffer_bytes + workspace_bytes, tmp_buffer->shape_view().elem_cnt()); + hipStream_t cuda_stream = ctx->stream()->As()->cuda_stream(); + const V* values_ptr; + if (has_values) { + const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); + values_ptr = reinterpret_cast(values->dptr()); + } else if (need_values_buffer) { + V* values_buffer_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); + hipLaunchKernelGGL(GenerateTableIds, BlocksNum4ThreadsNum(num_keys), kCudaThreadsNumPerBlock, 0, cuda_stream, + num_keys, num_tables, values_buffer_ptr); + values_ptr = values_buffer_ptr; + } else { + values_ptr = nullptr; + } + const bool need_process_table_ids = (has_values || num_tables > 1); + TableEntry* workspace_ptr = + reinterpret_cast*>(tmp_buffer->mut_dptr() + values_buffer_bytes); + UniqueAndPartition( + cuda_stream, num_keys, hash_capacity, 1, reinterpret_cast(keys->dptr()), + values_ptr, reinterpret_cast(num_unique->mut_dptr()), + reinterpret_cast(unique_keys->mut_dptr()), + reinterpret_cast(unique_values->mut_dptr()), + reinterpret_cast(inverse_indices->mut_dptr()), workspace_ptr, workspace_bytes, + need_process_table_ids); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL(k_dtype_pair, value_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("unique_key_value_pair") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("keys", 0) == OF_PP_PAIR_SECOND(k_dtype_pair)) \ + && (user_op::HobDataType("inverse_indices", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(value_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& keys = ctx->InputTensorDesc("keys", 0); \ + const int64_t num_keys = keys.shape().elem_cnt(); \ + const int64_t hash_capacity = num_keys; \ + const size_t workspace_bytes = GetCudaAlignedSize( \ + hash_capacity * sizeof(TableEntry)); \ + const int32_t num_tables = ctx->Attr("num_tables"); \ + const bool has_values = ctx->has_input("values", 0); \ + const bool need_values_buffer = (!has_values && num_tables > 1); \ + size_t values_buffer_bytes = \ + need_values_buffer \ + ? 
GetCudaAlignedSize(num_keys * sizeof(OF_PP_PAIR_FIRST(value_dtype_pair))) \ + : 0; \ + return workspace_bytes + values_buffer_bytes; \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_UNIQUE_KEY_VALUE_PAIR_KERNEL, ID_DATA_TYPE_SEQ, + ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("id_shuffle"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_shuffle"); +REGISTER_USER_KERNEL_UNIFIED_NCCL_COMM_INIT("embedding_gradient_shuffle"); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/diag_kernel.hip.cpp b/oneflow/user/kernels/diag_kernel.hip.cpp index fa8a382..99550b2 100644 --- a/oneflow/user/kernels/diag_kernel.hip.cpp +++ b/oneflow/user/kernels/diag_kernel.hip.cpp @@ -1,80 +1,80 @@ -#include "hip/hip_runtime.h" -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/user/kernels/diag_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { - -template -__global__ void vector_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { - CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i * stride] = in_buf[i]; } -} - -template -__global__ void matrix_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { - CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i] = in_buf[i * stride]; } -} - -template -struct DiagFunctor final { - void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t stride, - int32_t in_dim) { - if (in_dim == 1) { - vector_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, - stride); - } else { - matrix_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, - stride); - } - } -}; - -template -struct DiagGradFunctor final { - void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t dx_cnt, int32_t dy_cnt, - int32_t stride, int32_t in_dim) { - if (in_dim == 1) { - matrix_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, - dx_cnt, stride); - } else { - vector_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, - dy_cnt, stride); - } - } -}; - -} // namespace - -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, half); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, float); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, double); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, bool); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, uint8_t); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int8_t); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int32_t); -REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int64_t); - +#include "hip/hip_runtime.h" +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/user/kernels/diag_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { + +template +__global__ void vector_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { + CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i * stride] = in_buf[i]; } +} + +template +__global__ void matrix_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t stride) { + CUDA_1D_KERNEL_LOOP(i, size) { out_buf[i] = in_buf[i * stride]; } +} + +template +struct DiagFunctor final { + void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t stride, + int32_t in_dim) { + if (in_dim == 1) { + vector_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, + stride); + } else { + matrix_diagonal_kernel<<As()->cuda_stream()>>>(out_buf, in_buf, size, + stride); + } + } +}; + +template +struct DiagGradFunctor final { + void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t dx_cnt, int32_t dy_cnt, + int32_t stride, int32_t in_dim) { + if (in_dim == 1) { + matrix_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, + dx_cnt, stride); + } else { + vector_diagonal_kernel<<As()->cuda_stream()>>>(dx_buf, dy_buf, + dy_cnt, stride); + } + } +}; + +} // namespace + +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, half); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, float); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, double); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, bool); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, uint8_t); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int8_t); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int32_t); +REGISTER_DIAG_KERNELS(DeviceType::kCUDA, int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/diagonal_kernel.hip.cpp b/oneflow/user/kernels/diagonal_kernel.hip.cpp index cd0815e..e6aa116 100644 --- a/oneflow/user/kernels/diagonal_kernel.hip.cpp +++ b/oneflow/user/kernels/diagonal_kernel.hip.cpp @@ -1,163 +1,163 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace { - -template -__global__ void forward_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t dim1, - int32_t dim2) { - int32_t offset_index = (dim1 + 1) * dim2; - CUDA_1D_KERNEL_LOOP(index, size * dim2) { - int32_t i = index / dim2; - int32_t j = index - i * dim2; - out_buf[j * size + i] = in_buf[i * offset_index + j]; - } -} - -template -__global__ void backward_diagonal_kernel(T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, - int32_t dim2) { - int32_t offset_index = (dim1 + 1) * dim2; - CUDA_1D_KERNEL_LOOP(index, size * dim2) { - int32_t i = index / dim2; - int32_t j = index - i * dim2; - dx_buf[i * offset_index + j] = dy_buf[j * size + i]; - } -} - -template -struct DiagonalFunctor final { - void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t dim1, - int32_t dim2) { - if (size * dim2 > 0) { - forward_diagonal_kernel - <<As()->cuda_stream()>>>(out_buf, in_buf, size, dim1, dim2); - } - } -}; - -template -struct DiagonalGradFunctor final { - void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, - int32_t dim2) { - if (size * dim2 > 0) { - backward_diagonal_kernel - <<As()->cuda_stream()>>>(dx_buf, dy_buf, size, dim1, dim2); - } - } -}; - -} // namespace - -template -class GpuDiagonalKernel final : public user_op::OpKernel { - public: - GpuDiagonalKernel() = default; - ~GpuDiagonalKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const int32_t offset = ctx->Attr("offset"); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& out_shape = out->shape_view(); - const ShapeView& in_shape = in->shape_view(); - const T* in_buf = in->dptr(); - T* out_buf = out->mut_dptr(); - - int32_t size = out_shape.At(out_shape.NumAxes() - 1); - int32_t dim1 = in_shape.At(1); - int32_t dim2 = 0; - if (in_shape.NumAxes() <= 2) { - dim2 = 1; - } else { - dim2 = in_shape.Count(2, in_shape.NumAxes()); - } - - int32_t offset_in_bufer = (offset >= 0 ? 
offset * dim2 : -offset * dim1 * dim2); - in_buf += offset_in_bufer; - - DiagonalFunctor()(ctx->stream(), out_buf, in_buf, size, dim1, dim2); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuDiagonalBackwardKernel final : public user_op::OpKernel { - public: - GpuDiagonalBackwardKernel() = default; - ~GpuDiagonalBackwardKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - int32_t offset = ctx->Attr("offset"); - const ShapeView& dx_shape = dx->shape_view(); - const ShapeView& dy_shape = dy->shape_view(); - T* dx_buf = dx->mut_dptr(); - const T* dy_buf = dy->dptr(); - - Memset(ctx->stream(), dx->mut_dptr(), 0, dx_shape.elem_cnt() * sizeof(T)); - - int32_t dim1 = dx_shape.At(1); - int32_t dim2 = 0; - if (dx_shape.NumAxes() <= 2) { - dim2 = 1; - } else { - dim2 = dx_shape.Count(2, dx_shape.NumAxes()); - } - int32_t size = dy_shape.At(dy_shape.NumAxes() - 1); - int32_t offset_in_bufer = (offset >= 0 ? offset * dim2 : -offset * dim1 * dim2); - dx_buf += offset_in_bufer; - - DiagonalGradFunctor()(ctx->stream(), dx_buf, dy_buf, size, dim1, dim2); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DIAGONAL_KERNELS(dtype) \ - REGISTER_USER_KERNEL("diagonal") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("diagonal_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)); - -REGISTER_DIAGONAL_KERNELS(bool); -REGISTER_DIAGONAL_KERNELS(half); -REGISTER_DIAGONAL_KERNELS(float); -REGISTER_DIAGONAL_KERNELS(double); -REGISTER_DIAGONAL_KERNELS(int8_t); -REGISTER_DIAGONAL_KERNELS(int32_t); -REGISTER_DIAGONAL_KERNELS(int64_t); - -#undef REGISTER_DIAGONAL_KERNELS - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace { + +template +__global__ void forward_diagonal_kernel(T* out_buf, const T* in_buf, int32_t size, int32_t dim1, + int32_t dim2) { + int32_t offset_index = (dim1 + 1) * dim2; + CUDA_1D_KERNEL_LOOP(index, size * dim2) { + int32_t i = index / dim2; + int32_t j = index - i * dim2; + out_buf[j * size + i] = in_buf[i * offset_index + j]; + } +} + +template +__global__ void backward_diagonal_kernel(T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, + int32_t dim2) { + int32_t offset_index = (dim1 + 1) * dim2; + CUDA_1D_KERNEL_LOOP(index, size * dim2) { + int32_t i = index / dim2; + int32_t j = index - i * dim2; + dx_buf[i * offset_index + j] = dy_buf[j * size + i]; + } +} + +template +struct DiagonalFunctor final { + void operator()(ep::Stream* stream, T* out_buf, const T* in_buf, int32_t size, int32_t dim1, + int32_t dim2) { + if (size * dim2 > 0) { + forward_diagonal_kernel + <<As()->cuda_stream()>>>(out_buf, in_buf, size, dim1, dim2); + } + } +}; + +template +struct DiagonalGradFunctor final { + void operator()(ep::Stream* stream, T* dx_buf, const T* dy_buf, int32_t size, int32_t dim1, + int32_t dim2) { + if (size * dim2 > 0) { + backward_diagonal_kernel + <<As()->cuda_stream()>>>(dx_buf, dy_buf, size, dim1, dim2); + } + } +}; + +} // namespace + +template +class GpuDiagonalKernel final : public user_op::OpKernel { + public: + GpuDiagonalKernel() = default; + ~GpuDiagonalKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const int32_t offset = ctx->Attr("offset"); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const ShapeView& out_shape = out->shape_view(); + const ShapeView& in_shape = in->shape_view(); + const T* in_buf = in->dptr(); + T* out_buf = out->mut_dptr(); + + int32_t size = out_shape.At(out_shape.NumAxes() - 1); + int32_t dim1 = in_shape.At(1); + int32_t dim2 = 0; + if (in_shape.NumAxes() <= 2) { + dim2 = 1; + } else { + dim2 = in_shape.Count(2, in_shape.NumAxes()); + } + + int32_t offset_in_bufer = (offset >= 0 ? 
offset * dim2 : -offset * dim1 * dim2); + in_buf += offset_in_bufer; + + DiagonalFunctor()(ctx->stream(), out_buf, in_buf, size, dim1, dim2); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class GpuDiagonalBackwardKernel final : public user_op::OpKernel { + public: + GpuDiagonalBackwardKernel() = default; + ~GpuDiagonalBackwardKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + int32_t offset = ctx->Attr("offset"); + const ShapeView& dx_shape = dx->shape_view(); + const ShapeView& dy_shape = dy->shape_view(); + T* dx_buf = dx->mut_dptr(); + const T* dy_buf = dy->dptr(); + + Memset(ctx->stream(), dx->mut_dptr(), 0, dx_shape.elem_cnt() * sizeof(T)); + + int32_t dim1 = dx_shape.At(1); + int32_t dim2 = 0; + if (dx_shape.NumAxes() <= 2) { + dim2 = 1; + } else { + dim2 = dx_shape.Count(2, dx_shape.NumAxes()); + } + int32_t size = dy_shape.At(dy_shape.NumAxes() - 1); + int32_t offset_in_bufer = (offset >= 0 ? offset * dim2 : -offset * dim1 * dim2); + dx_buf += offset_in_bufer; + + DiagonalGradFunctor()(ctx->stream(), dx_buf, dy_buf, size, dim1, dim2); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DIAGONAL_KERNELS(dtype) \ + REGISTER_USER_KERNEL("diagonal") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("diagonal_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)); + +REGISTER_DIAGONAL_KERNELS(bool); +REGISTER_DIAGONAL_KERNELS(half); +REGISTER_DIAGONAL_KERNELS(float); +REGISTER_DIAGONAL_KERNELS(double); +REGISTER_DIAGONAL_KERNELS(int8_t); +REGISTER_DIAGONAL_KERNELS(int32_t); +REGISTER_DIAGONAL_KERNELS(int64_t); + +#undef REGISTER_DIAGONAL_KERNELS + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp b/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp index 69ce03c..934c29f 100644 --- a/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp +++ b/oneflow/user/kernels/dim_gather_kernel_util.hip.cpp @@ -1,65 +1,65 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/dim_gather_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template -__global__ void DoCUDADimGather(const DimOpIndexNdHelper input_nd_helper, - const DimOpIndexNdHelper index_nd_helper, int ndim, - int64_t elem_cnt, int32_t dim_length, int32_t dim, - const IDX_T* index, const IN_T* input, IN_T* output) { - DoDimGather(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, - input, output); -} - -template -struct DimGatherFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const IN_T* input, - IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimGather), stream, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, input, - output); - } -}; - -// float16 special case of DimGatherFunctor template -template -struct DimGatherFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& input_nd_helper, - const DimOpIndexNdHelper& index_nd_helper, int ndim, int64_t elem_cnt, - int32_t dim_length, int32_t dim, const IDX_T* index, const float16* input, - float16* output) { - RUN_CUDA_KERNEL((DoCUDADimGather), stream, BlocksNum4ThreadsNum(elem_cnt), - input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, - reinterpret_cast(input), reinterpret_cast(output)); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kCUDA), - DIM_GATHER_SCATTER_DATA_TYPE_CUDA_SEQ, INDEX_DATA_TYPE_SEQ); - -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#ifdef WITH_ROCM
+#include "hip/hip_runtime.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/user/kernels/dim_gather_kernel_util.h"
+
+namespace oneflow {
+
+namespace user_op {
+
+template<typename IN_T, typename IDX_T>
+__global__ void DoCUDADimGather(const DimOpIndexNdHelper<IDX_T> input_nd_helper,
+                                const DimOpIndexNdHelper<IDX_T> index_nd_helper, int ndim,
+                                int64_t elem_cnt, int32_t dim_length, int32_t dim,
+                                const IDX_T* index, const IN_T* input, IN_T* output) {
+  DoDimGather<IN_T, IDX_T>(input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index,
+                           input, output);
+}
+
+template<typename IN_T, typename IDX_T>
+struct DimGatherFunctor<DeviceType::kCUDA, IN_T, IDX_T> final {
+  void operator()(ep::Stream* stream, const DimOpIndexNdHelper<IDX_T>& input_nd_helper,
+                  const DimOpIndexNdHelper<IDX_T>& index_nd_helper, int ndim, int64_t elem_cnt,
+                  int32_t dim_length, int32_t dim, const IDX_T* index, const IN_T* input,
+                  IN_T* output) {
+    RUN_CUDA_KERNEL((DoCUDADimGather<IN_T, IDX_T>), stream, BlocksNum4ThreadsNum(elem_cnt),
+                    input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index, input,
+                    output);
+  }
+};
+
+// float16 special case of DimGatherFunctor template
+template<typename IDX_T>
+struct DimGatherFunctor<DeviceType::kCUDA, float16, IDX_T> final {
+  void operator()(ep::Stream* stream, const DimOpIndexNdHelper<IDX_T>& input_nd_helper,
+                  const DimOpIndexNdHelper<IDX_T>& index_nd_helper, int ndim, int64_t elem_cnt,
+                  int32_t dim_length, int32_t dim, const IDX_T* index, const float16* input,
+                  float16* output) {
+    RUN_CUDA_KERNEL((DoCUDADimGather<half, IDX_T>), stream, BlocksNum4ThreadsNum(elem_cnt),
+                    input_nd_helper, index_nd_helper, ndim, elem_cnt, dim_length, dim, index,
+                    reinterpret_cast<const half*>(input), reinterpret_cast<half*>(output));
+  }
+};
+
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_DIM_GATHER_FUNCTOR, (DeviceType::kCUDA),
+                                 DIM_GATHER_SCATTER_DATA_TYPE_CUDA_SEQ, INDEX_DATA_TYPE_SEQ);
+
+}  // namespace user_op
+}  // namespace oneflow
+
 #endif  // WITH_ROCM
\ No newline at end of file
diff --git a/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp b/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp
index cc996ea..d436a74 100644
--- a/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp
+++ b/oneflow/user/kernels/dim_scatter_kernel_util.hip.cpp
@@ -1,67 +1,67 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/dim_scatter_kernel_util.h" - -namespace oneflow { -namespace user_op { - -template class Opt> -__global__ void DoCUDADimScatter(const DimOpIndexNdHelper src_nd_helper, - const DimOpIndexNdHelper idx_nd_helper, - const DimOpIndexNdHelper output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, - const int64_t upper_bound, const IDX_T* index, const IN_T* src, - IN_T* output) { - DoDimScatter(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, - dim, upper_bound, index, src, output); -} - -template class Opt> -struct DimScatterFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, - const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const IN_T* src, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, src, output); - } -}; - -template class Opt> -struct DimScatterFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, - const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, - const IDX_T* index, const float16* src, float16* output) { - RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), - src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, reinterpret_cast(src), - reinterpret_cast(output)); - } -}; - -INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpAddFunctor); -INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpUpdateFunctor); - -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/dim_scatter_kernel_util.h" + +namespace oneflow { +namespace user_op { + +template class Opt> +__global__ void DoCUDADimScatter(const DimOpIndexNdHelper src_nd_helper, + const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, + const int64_t upper_bound, const IDX_T* index, const IN_T* src, + IN_T* output) { + DoDimScatter(src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src, output); +} + +template class Opt> +struct DimScatterFunctor final { + void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const IN_T* src, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src, output); + } +}; + +template class Opt> +struct DimScatterFunctor final { + void operator()(ep::Stream* stream, const DimOpIndexNdHelper& src_nd_helper, + const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, const int64_t upper_bound, + const IDX_T* index, const float16* src, float16* output) { + RUN_CUDA_KERNEL((DoCUDADimScatter), stream, BlocksNum4ThreadsNum(elem_cnt), + src_nd_helper, idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, reinterpret_cast(src), + reinterpret_cast(output)); + } +}; + +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpAddFunctor); +INSTANTIATE_DIM_SCATTER_FUNCTORS(DeviceType::kCUDA, BinOpUpdateFunctor); + +} // namespace user_op +} // namespace oneflow + #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp index 297271b..e01a1ef 100644 --- a/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp +++ b/oneflow/user/kernels/dim_scatter_scalar_kernel_util.hip.cpp @@ -1,51 +1,51 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/dim_scatter_scalar_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template class Opt> -__global__ void DoCUDADimScatterScalar(const DimOpIndexNdHelper idx_nd_helper, - const DimOpIndexNdHelper output_nd_helper, - const int ndim, const int64_t elem_cnt, const int32_t dim, - const int64_t upper_bound, const IDX_T* index, - const IN_T src_scalar, IN_T* output) { - DoScatterScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, - upper_bound, index, src_scalar, output); -} - -template class Opt> -struct DimScatterScalarFunctor final { - void operator()(ep::Stream* stream, const DimOpIndexNdHelper& idx_nd_helper, - const DimOpIndexNdHelper& output_nd_helper, const int ndim, - const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, - const IDX_T* index, const IN_T src, IN_T* output) { - RUN_CUDA_KERNEL((DoCUDADimScatterScalar), stream, - BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, - dim, upper_bound, index, src, output); - } -}; - -INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, UpdateScalarFunctor); -INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, AddScalarFunctor); - -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/dim_scatter_scalar_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template class Opt> +__global__ void DoCUDADimScatterScalar(const DimOpIndexNdHelper idx_nd_helper, + const DimOpIndexNdHelper output_nd_helper, + const int ndim, const int64_t elem_cnt, const int32_t dim, + const int64_t upper_bound, const IDX_T* index, + const IN_T src_scalar, IN_T* output) { + DoScatterScalarFunctor(idx_nd_helper, output_nd_helper, ndim, elem_cnt, dim, + upper_bound, index, src_scalar, output); +} + +template class Opt> +struct DimScatterScalarFunctor final { + void operator()(ep::Stream* stream, const DimOpIndexNdHelper& idx_nd_helper, + const DimOpIndexNdHelper& output_nd_helper, const int ndim, + const int64_t elem_cnt, const int32_t dim, int64_t upper_bound, + const IDX_T* index, const IN_T src, IN_T* output) { + RUN_CUDA_KERNEL((DoCUDADimScatterScalar), stream, + BlocksNum4ThreadsNum(elem_cnt), idx_nd_helper, output_nd_helper, ndim, elem_cnt, + dim, upper_bound, index, src, output); + } +}; + +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, UpdateScalarFunctor); +INSTANTIATE_DIM_SCATTER_SCARLAR_FUNCTORS(DeviceType::kCUDA, AddScalarFunctor); + +} // namespace user_op +} // namespace oneflow #endif \ No newline at end of file diff --git a/oneflow/user/kernels/distributions/normal_distribution.hip.cpp b/oneflow/user/kernels/distributions/normal_distribution.hip.cpp index 6056f47..6a888fc 100644 --- a/oneflow/user/kernels/distributions/normal_distribution.hip.cpp +++ b/oneflow/user/kernels/distributions/normal_distribution.hip.cpp @@ -1,71 +1,71 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/distributions/normal_distribution.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ T GenNormal(hiprandState* state, const T mean, const T std); - -template<> -__device__ float GenNormal(hiprandState* state, const float mean, const float std) { - return (hiprand_normal(state) + mean) / std; -} - -template<> -__device__ double GenNormal(hiprandState* state, const double mean, const double std) { - return (hiprand_normal_double(state) + mean) / std; -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T mean, - const T std) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenNormal(&localState, mean, std); } - state[id] = localState; -} - -} // namespace - -template -void NormalDistribution::operator()( - ep::Stream* stream, const int64_t elem_cnt, T* dptr, - const std::shared_ptr& generator) const { - CHECK_GE(elem_cnt, 0); - const auto device_index = stream->device()->device_index(); - auto gen = CHECK_JUST(generator->Get(device_index)); - int32_t block_num = gen->max_block_num(); - int32_t thread_num = gen->max_thread_num(); - auto* curand_states = gen->curand_states(); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, elem_cnt, dptr, mean_, std_); -} - -#define INITIATE_CUDA_NORMAL_DISTRIBUTION(T, typeproto) \ - template void NormalDistribution::operator()( \ - ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ - const std::shared_ptr& generator) const; - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_NORMAL_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/distributions/normal_distribution.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ T GenNormal(hiprandState* state, const T mean, const T std); + +template<> +__device__ float GenNormal(hiprandState* state, const float mean, const float std) { + return (hiprand_normal(state) + mean) / std; +} + +template<> +__device__ double GenNormal(hiprandState* state, const double mean, const double std) { + return (hiprand_normal_double(state) + mean) / std; +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T mean, + const T std) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenNormal(&localState, mean, std); } + state[id] = localState; +} + +} // namespace + +template +void NormalDistribution::operator()( + ep::Stream* stream, const int64_t elem_cnt, T* dptr, + const std::shared_ptr& generator) const { + CHECK_GE(elem_cnt, 0); + const auto device_index = stream->device()->device_index(); + auto gen = CHECK_JUST(generator->Get(device_index)); + int32_t block_num = gen->max_block_num(); + int32_t thread_num = gen->max_thread_num(); + auto* curand_states = gen->curand_states(); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, elem_cnt, dptr, mean_, std_); +} + +#define INITIATE_CUDA_NORMAL_DISTRIBUTION(T, typeproto) \ + template void NormalDistribution::operator()( \ + ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ + const std::shared_ptr& generator) const; + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_NORMAL_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp b/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp index b6fbc7c..d22241c 100644 --- a/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp +++ b/oneflow/user/kernels/distributions/uniform_distribution.hip.cpp @@ -1,77 +1,77 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/user/kernels/distributions/uniform_distribution.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ T GenUniform(hiprandState* state, const T low, const T high); - -template<> -__device__ float GenUniform(hiprandState* state, const float low, const float high) { - auto rand_num = hiprand_uniform(state); - // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here - if (rand_num == 1.0) { rand_num = 0.0; } - return rand_num * (high - low) + low; -} - -template<> -__device__ double GenUniform(hiprandState* state, const double low, const double high) { - auto rand_num = hiprand_uniform_double(state); - // hiprand_uniform_double generates (0.0, 1.0], but we want [0.0, 1.0) here - if (rand_num == 1.0) { rand_num = 0.0; } - return rand_num * (high - low) + low; -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T low, - const T high) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenUniform(&localState, low, high); } - state[id] = localState; -} - -} // namespace - -template -void UniformDistribution::operator()( - ep::Stream* stream, const int64_t elem_cnt, T* dptr, - const std::shared_ptr& generator) const { - CHECK_GE(elem_cnt, 0); - const auto device_index = stream->device()->device_index(); - auto gen = CHECK_JUST(generator->Get(device_index)); - int32_t block_num = gen->max_block_num(); - int32_t thread_num = gen->max_thread_num(); - auto* curand_states = gen->curand_states(); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, elem_cnt, dptr, low_, high_); -} - -#define INITIATE_CUDA_UNIFORM_DISTRIBUTION(T, typeproto) \ - template void UniformDistribution::operator()( \ - ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ - const std::shared_ptr& generator) const; - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/user/kernels/distributions/uniform_distribution.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ T GenUniform(hiprandState* state, const T low, const T high); + +template<> +__device__ float GenUniform(hiprandState* state, const float low, const float high) { + auto rand_num = hiprand_uniform(state); + // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here + if (rand_num == 1.0) { rand_num = 0.0; } + return rand_num * (high - low) + low; +} + +template<> +__device__ double GenUniform(hiprandState* state, const double low, const double high) { + auto rand_num = hiprand_uniform_double(state); + // hiprand_uniform_double generates (0.0, 1.0], but we want [0.0, 1.0) here + if (rand_num == 1.0) { rand_num = 0.0; } + return rand_num * (high - low) + low; +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const T low, + const T high) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { dptr[i] = GenUniform(&localState, low, high); } + state[id] = localState; +} + +} // namespace + +template +void UniformDistribution::operator()( + ep::Stream* stream, const int64_t elem_cnt, T* dptr, + const std::shared_ptr& generator) const { + CHECK_GE(elem_cnt, 0); + const auto device_index = stream->device()->device_index(); + auto gen = CHECK_JUST(generator->Get(device_index)); + int32_t block_num = gen->max_block_num(); + int32_t thread_num = gen->max_thread_num(); + auto* curand_states = gen->curand_states(); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, elem_cnt, dptr, low_, high_); +} + +#define INITIATE_CUDA_UNIFORM_DISTRIBUTION(T, typeproto) \ + template void UniformDistribution::operator()( \ + ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ + const std::shared_ptr& generator) const; + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp b/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp index 0ff5160..e4e72a2 100644 --- a/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp +++ b/oneflow/user/kernels/distributions/uniform_int_distribution.hip.cpp @@ -1,72 +1,72 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/common/preprocessor.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/dtype.h" -#include "oneflow/user/kernels/distributions/uniform_int_distribution.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -__device__ int64_t GenUniformInt(hiprandState* state, const int64_t low, const int64_t high) { - auto rand_num = hiprand_uniform(state); - // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here - if (rand_num == 1.0) { rand_num = 0.0; } - return static_cast(rand_num * (high - low) + low); -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const int64_t low, - const int64_t high) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - dptr[i] = static_cast(GenUniformInt(&localState, low, high)); - } - state[id] = localState; -} - -} // namespace - -template -void UniformIntDistribution::operator()( - ep::Stream* stream, const int64_t elem_cnt, T* dptr, - const std::shared_ptr& generator) const { - CHECK_GE(elem_cnt, 0); - const auto device_index = stream->device()->device_index(); - auto gen = CHECK_JUST(generator->Get(device_index)); - int32_t block_num = gen->max_block_num(); - int32_t thread_num = gen->max_thread_num(); - auto* curand_states = gen->curand_states(); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, elem_cnt, dptr, low_, high_); -} - -#define INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION(T, typeproto) \ - template void UniformIntDistribution::operator()( \ - ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ - const std::shared_ptr& generator) const; - -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, INT_DATA_TYPE_SEQ) -OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, UNSIGNED_INT_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/common/preprocessor.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/dtype.h" +#include "oneflow/user/kernels/distributions/uniform_int_distribution.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +__device__ int64_t GenUniformInt(hiprandState* state, const int64_t low, const int64_t high) { + auto rand_num = hiprand_uniform(state); + // hiprand_uniform generates (0.0, 1.0], but we want [0.0, 1.0) here + if (rand_num == 1.0) { rand_num = 0.0; } + return static_cast(rand_num * (high - low) + low); +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t elem_cnt, T* dptr, const int64_t low, + const int64_t high) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + dptr[i] = static_cast(GenUniformInt(&localState, low, high)); + } + state[id] = localState; +} + +} // namespace + +template +void UniformIntDistribution::operator()( + ep::Stream* stream, const int64_t elem_cnt, T* dptr, + const std::shared_ptr& generator) const { + CHECK_GE(elem_cnt, 0); + const auto device_index = stream->device()->device_index(); + auto gen = CHECK_JUST(generator->Get(device_index)); + int32_t block_num = gen->max_block_num(); + int32_t thread_num = gen->max_thread_num(); + auto* curand_states = gen->curand_states(); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, elem_cnt, dptr, low_, high_); +} + +#define INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION(T, typeproto) \ + template void UniformIntDistribution::operator()( \ + ep::Stream* stream, const int64_t elem_cnt, T* dptr, \ + const std::shared_ptr& generator) const; + +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, FLOATING_DATA_TYPE_SEQ) +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, INT_DATA_TYPE_SEQ) +OF_PP_FOR_EACH_TUPLE(INITIATE_CUDA_UNIFORM_INT_DISTRIBUTION, UNSIGNED_INT_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dropout_kernel.hip.cpp b/oneflow/user/kernels/dropout_kernel.hip.cpp index 4fe3552..5a4aab1 100644 --- a/oneflow/user/kernels/dropout_kernel.hip.cpp +++ b/oneflow/user/kernels/dropout_kernel.hip.cpp @@ -1,463 +1,463 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/op_kernel_wrapper.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/dropout_kernel.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -// #include "oneflow/core/device/cuda_pseudo_bfloat16.h" -namespace oneflow { - -namespace { - -constexpr int32_t kVecSize = 4; -constexpr int32_t kBlockSize = 256; - -template -constexpr int32_t GetDropoutPackSize() { - // For float, bfloat16, half. - return 4; -}; - -template<> -constexpr int32_t GetDropoutPackSize() { - return 2; -}; - -template<> -constexpr int32_t GetDropoutPackSize() { - return 2; -}; - -union RandPack4 { - float4 storage; - float elem[4]; -}; - -template -struct GetPack2Type { - using T2 = typename std::aligned_storage<2 * sizeof(T), 2 * sizeof(T)>::type; -}; - -template<> -struct GetPack2Type { - using T2 = half2; -}; - - -template -using Pack2Type = typename GetPack2Type::T2; - -using H2PackType = typename std::aligned_storage<4 * sizeof(half), 4 * sizeof(half)>::type; - -template -union H2Pack { - cuda::elementwise::Pack pack_storage; - Pack2Type h2[2]; - __device__ H2Pack() { - // do nothing - } -}; - -template<> -union H2Pack { - cuda::elementwise::Pack pack_storage; - half2 h2[2]; - __device__ H2Pack() { - // do nothing - } -}; - -template -__device__ Pack2Type Make2(float v); - -template<> -__device__ Pack2Type Make2(float v) { - return __float2half2_rn(v); -} - -#define RETURN_VOID_IF_HALF typename std::enable_if_t::value, void> - -#define RETURN_VOID_IF_FLOAT typename std::enable_if_t::value, void> -#define RETURN_VOID_IF_DOUBLE typename std::enable_if_t::value, void> - -template -__global__ RETURN_VOID_IF_FLOAT FusedDropoutAddGpu( - uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, - const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, - const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using MaskType = cuda::elementwise::PackType; - using MaskPack = cuda::elementwise::Pack; - - T t_scale = static_cast(scale); - RandPack4 rand_uniform_pack4; - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - rand_uniform_pack4.storage = hiprand_uniform4(&state); - - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - LoadPack addend_vec; - if (has_addend) { - const LoadType* addend_load = reinterpret_cast(addend + linear_index); - addend_vec.storage = *addend_load; - } - - MaskPack mask_vec; - LoadPack y_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; - T tmp_float_mask = static_cast(mask_vec.elem[i]); - y_vec.elem[i] = x_vec.elem[i] * tmp_float_mask * t_scale; - if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } - } - - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; - } - - if (tail && 
global_thread_id < n_tail) { - const float rand_uniform = hiprand_uniform(&state); - const bool mask_val = rand_uniform > rate; - tail_mask[global_thread_id] = mask_val; - T tmp_float_mask = static_cast(mask_val); - T tmp_tail_out = tail_x[global_thread_id] * tmp_float_mask * t_scale; - if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } - tail_y[global_thread_id] = tmp_tail_out; - } - - __syncthreads(); - - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -template -__global__ RETURN_VOID_IF_HALF FusedDropoutAddGpu( - uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, - const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, - const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using StoreType = cuda::elementwise::PackType, pack_size / 2>; - using StorePack = cuda::elementwise::Pack, pack_size / 2>; - using MaskType = cuda::elementwise::PackType; - using MaskPack = cuda::elementwise::Pack; - - RandPack4 rand_uniform_pack4; - Pack2Type h2_scale = Make2(scale); - - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - rand_uniform_pack4.storage = hiprand_uniform4(&state); - const LoadType* x_load = reinterpret_cast(x + linear_index); - H2Pack x_vec{}; - x_vec.pack_storage.storage = *x_load; - - H2Pack addend_vec{}; - if (has_addend) { - const LoadType* addend_load = reinterpret_cast(addend + linear_index); - addend_vec.pack_storage.storage = *addend_load; - } - - MaskPack mask_vec; - StorePack y_vec; - StorePack one_or_zero_h2; - - mask_vec.elem[0] = rand_uniform_pack4.elem[0] > rate; - float tmp_float_mask = static_cast(mask_vec.elem[0]); - one_or_zero_h2.elem[0].x = tmp_float_mask; - mask_vec.elem[1] = rand_uniform_pack4.elem[1] > rate; - tmp_float_mask = static_cast(mask_vec.elem[1]); - one_or_zero_h2.elem[0].y = tmp_float_mask; - y_vec.elem[0] = __hmul2(__hmul2(x_vec.h2[0], one_or_zero_h2.elem[0]), h2_scale); - - mask_vec.elem[2] = rand_uniform_pack4.elem[2] > rate; - tmp_float_mask = static_cast(mask_vec.elem[2]); - one_or_zero_h2.elem[1].x = tmp_float_mask; - mask_vec.elem[3] = rand_uniform_pack4.elem[3] > rate; - tmp_float_mask = static_cast(mask_vec.elem[3]); - one_or_zero_h2.elem[1].y = tmp_float_mask; - y_vec.elem[1] = __hmul2(__hmul2(x_vec.h2[1], one_or_zero_h2.elem[1]), h2_scale); - - if (has_addend) { - y_vec.elem[0] = __hadd2(y_vec.elem[0], addend_vec.h2[0]); - y_vec.elem[1] = __hadd2(y_vec.elem[1], addend_vec.h2[1]); - } - - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; - } - - if (tail && global_thread_id < n_tail) { - const float rand_uniform = hiprand_uniform(&state); - const bool mask_val = rand_uniform > rate; - tail_mask[global_thread_id] = mask_val; - float tmp_half_mask = static_cast(mask_val); - T tmp_tail_out = tail_x[global_thread_id] * static_cast(tmp_half_mask) * 
static_cast(h2_scale.data.x); - if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } - tail_y[global_thread_id] = tmp_tail_out; - } - - __syncthreads(); - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -template -__global__ RETURN_VOID_IF_DOUBLE FusedDropoutAddGpu( - uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, - const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, - const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - using MaskType = cuda::elementwise::PackType; - using MaskPack = cuda::elementwise::Pack; - - RandPack4 rand_uniform_pack4; - bool grid_loop_rand_state = 0; - - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - if (grid_loop_rand_state == 0) { - rand_uniform_pack4.storage = hiprand_uniform4(&state); - grid_loop_rand_state ^= 1; - } else { - // Use the last two random numbers we generated in previous iteration. - rand_uniform_pack4.elem[0] = rand_uniform_pack4.elem[2]; - rand_uniform_pack4.elem[1] = rand_uniform_pack4.elem[3]; - grid_loop_rand_state ^= 1; - } - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - LoadPack addend_vec; - if (has_addend) { - const LoadType* addend_load = reinterpret_cast(addend + linear_index); - addend_vec.storage = *addend_load; - } - - MaskPack mask_vec; - LoadPack y_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; - y_vec.elem[i] = x_vec.elem[i] * mask_vec.elem[i] * scale; - if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } - } - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; - } - - if (tail && global_thread_id < n_tail) { - const float rand_uniform = hiprand_uniform(&state); - const bool mask_val = rand_uniform > rate; - tail_mask[global_thread_id] = mask_val; - double tmp_tail_out = tail_x[global_thread_id] * mask_val * scale; - if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } - tail_y[global_thread_id] = tmp_tail_out; - } - - __syncthreads(); - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -unsigned int ComputeGridSize(ep::Stream* stream, const int32_t block_size, const int64_t elem_cnt) { - auto* cuda_stream = stream->As(); - const int32_t max_threads_multi_process = - cuda_stream->device_properties().maxThreadsPerMultiProcessor; - const int32_t multi_processor_count = cuda_stream->device_properties().multiProcessorCount; - unsigned int blocks_per_sm = max_threads_multi_process / block_size; - unsigned int grid_size = std::max((int64_t)1, 
((elem_cnt + block_size - 1) / block_size)); - grid_size = std::min((unsigned int)multi_processor_count * blocks_per_sm, grid_size); - return grid_size; -} - -template -void DispatchTail(ep::Stream* stream, uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, - const int64_t elem_cnt, float rate, float scale, const T* x, bool* mask, - const T* addend, T* y) { - constexpr int pack_size = GetDropoutPackSize(); - const int64_t pack_num = elem_cnt / pack_size; - unsigned int grid_size = ComputeGridSize(stream, kBlockSize, pack_num); - const int64_t tail_offset = pack_num * pack_size; - const int64_t n_tail = elem_cnt - tail_offset; - const bool tail = n_tail > 0 ? true : false; - uint64_t inc_offset = 0; - - if (tail) { - // If tail, we need generate randnum one more time, so here we add another `1`. - inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize + 1; - FusedDropoutAddGpu - <<As()->cuda_stream()>>>( - seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, - (x + tail_offset), (mask + tail_offset), (addend + tail_offset), (y + tail_offset)); - } else { - inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize; - FusedDropoutAddGpu - <<As()->cuda_stream()>>>( - seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, - nullptr, nullptr, nullptr, nullptr); - } -} - -template -struct MaskAndScaleFunctor { - OF_DEVICE_FUNC explicit MaskAndScaleFunctor(float scale) : scale(scale) {} - __device__ T operator()(T x, bool mask) const { - return x * static_cast(mask) * static_cast(scale); - } - float scale; -}; - - - -template -class DropoutKernelGPU final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - DropoutKernelGPU() = default; - ~DropoutKernelGPU() = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA)); - return std::make_shared(generator); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - auto* fused_dropout_kernel_state = dynamic_cast(state); - CHECK_NOTNULL(fused_dropout_kernel_state); - const auto& generator = fused_dropout_kernel_state->generator(); - CHECK_NOTNULL(generator); - auto* stream = ctx->stream(); - const auto device_index = stream->device()->device_index(); - std::shared_ptr cuda_generator = - CHECK_JUST(generator->Get(device_index)); - uint64_t seed = cuda_generator->current_seed(); - - const float rate = ctx->Attr("rate"); - float scale = 0.0; - if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } - one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); - - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - DispatchTail( - stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, - reinterpret_cast(in->dptr()), reinterpret_cast(mask->mut_dptr()), - reinterpret_cast(addend->dptr()), reinterpret_cast(out->mut_dptr())); - } else { - DispatchTail(stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, - reinterpret_cast(in->dptr()), - 
reinterpret_cast(mask->mut_dptr()), nullptr, - reinterpret_cast(out->mut_dptr())); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DROPOUT_KERNEL_GPU(cpp_type, data_type) \ - REGISTER_USER_KERNEL("dropout").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == data_type) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)) - -REGISTER_DROPOUT_KERNEL_GPU(half, DataType::kFloat16); -REGISTER_DROPOUT_KERNEL_GPU(float, DataType::kFloat); -REGISTER_DROPOUT_KERNEL_GPU(double, DataType::kDouble); - - -template -class DropoutGradKernelGPU final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - DropoutGradKernelGPU() = default; - ~DropoutGradKernelGPU() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float scale = ctx->Attr("scale"); - const int64_t elem_cnt = dy->shape_view().elem_cnt(); - OF_CUDA_CHECK((cuda::elementwise::Binary( - MaskAndScaleFunctor(scale), elem_cnt, reinterpret_cast(dx->mut_dptr()), - reinterpret_cast(dy->dptr()), reinterpret_cast(mask->dptr()), - ctx->stream()->As()->cuda_stream()))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ - REGISTER_USER_KERNEL("dropout_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == data_type)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "dy", 0, true)); \ - return Maybe::Ok(); \ - }) - -REGISTER_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16); -REGISTER_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat); -REGISTER_DROPOUT_GRAD_KERNEL_GPU(double, DataType::kDouble); - -} // namespace - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/op_kernel_wrapper.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/dropout_kernel.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +// #include "oneflow/core/device/cuda_pseudo_bfloat16.h" +namespace oneflow { + +namespace { + +constexpr int32_t kVecSize = 4; +constexpr int32_t kBlockSize = 256; + +template +constexpr int32_t GetDropoutPackSize() { + // For float, bfloat16, half. 
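+  // One hiprand_uniform4() call yields four uniform floats, so a pack of four lets each
+  // vectorized loop iteration consume exactly one batch of generated random numbers.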
+ return 4; +}; + +template<> +constexpr int32_t GetDropoutPackSize() { + return 2; +}; + +template<> +constexpr int32_t GetDropoutPackSize() { + return 2; +}; + +union RandPack4 { + float4 storage; + float elem[4]; +}; + +template +struct GetPack2Type { + using T2 = typename std::aligned_storage<2 * sizeof(T), 2 * sizeof(T)>::type; +}; + +template<> +struct GetPack2Type { + using T2 = half2; +}; + + +template +using Pack2Type = typename GetPack2Type::T2; + +using H2PackType = typename std::aligned_storage<4 * sizeof(half), 4 * sizeof(half)>::type; + +template +union H2Pack { + cuda::elementwise::Pack pack_storage; + Pack2Type h2[2]; + __device__ H2Pack() { + // do nothing + } +}; + +template<> +union H2Pack { + cuda::elementwise::Pack pack_storage; + half2 h2[2]; + __device__ H2Pack() { + // do nothing + } +}; + +template +__device__ Pack2Type Make2(float v); + +template<> +__device__ Pack2Type Make2(float v) { + return __float2half2_rn(v); +} + +#define RETURN_VOID_IF_HALF typename std::enable_if_t::value, void> + +#define RETURN_VOID_IF_FLOAT typename std::enable_if_t::value, void> +#define RETURN_VOID_IF_DOUBLE typename std::enable_if_t::value, void> + +template +__global__ RETURN_VOID_IF_FLOAT FusedDropoutAddGpu( + uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, + const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, + const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using MaskType = cuda::elementwise::PackType; + using MaskPack = cuda::elementwise::Pack; + + T t_scale = static_cast(scale); + RandPack4 rand_uniform_pack4; + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + rand_uniform_pack4.storage = hiprand_uniform4(&state); + + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + LoadPack addend_vec; + if (has_addend) { + const LoadType* addend_load = reinterpret_cast(addend + linear_index); + addend_vec.storage = *addend_load; + } + + MaskPack mask_vec; + LoadPack y_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; + T tmp_float_mask = static_cast(mask_vec.elem[i]); + y_vec.elem[i] = x_vec.elem[i] * tmp_float_mask * t_scale; + if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } + } + + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const float rand_uniform = hiprand_uniform(&state); + const bool mask_val = rand_uniform > rate; + tail_mask[global_thread_id] = mask_val; + T tmp_float_mask = static_cast(mask_val); + T tmp_tail_out = tail_x[global_thread_id] * tmp_float_mask * t_scale; + if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } + tail_y[global_thread_id] = tmp_tail_out; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain 
the state of generator's dev_offset + } + } +} + +template +__global__ RETURN_VOID_IF_HALF FusedDropoutAddGpu( + uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, + const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, bool* mask, + const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using StoreType = cuda::elementwise::PackType, pack_size / 2>; + using StorePack = cuda::elementwise::Pack, pack_size / 2>; + using MaskType = cuda::elementwise::PackType; + using MaskPack = cuda::elementwise::Pack; + + RandPack4 rand_uniform_pack4; + Pack2Type h2_scale = Make2(scale); + + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + rand_uniform_pack4.storage = hiprand_uniform4(&state); + const LoadType* x_load = reinterpret_cast(x + linear_index); + H2Pack x_vec{}; + x_vec.pack_storage.storage = *x_load; + + H2Pack addend_vec{}; + if (has_addend) { + const LoadType* addend_load = reinterpret_cast(addend + linear_index); + addend_vec.pack_storage.storage = *addend_load; + } + + MaskPack mask_vec; + StorePack y_vec; + StorePack one_or_zero_h2; + + mask_vec.elem[0] = rand_uniform_pack4.elem[0] > rate; + float tmp_float_mask = static_cast(mask_vec.elem[0]); + one_or_zero_h2.elem[0].x = tmp_float_mask; + mask_vec.elem[1] = rand_uniform_pack4.elem[1] > rate; + tmp_float_mask = static_cast(mask_vec.elem[1]); + one_or_zero_h2.elem[0].y = tmp_float_mask; + y_vec.elem[0] = __hmul2(__hmul2(x_vec.h2[0], one_or_zero_h2.elem[0]), h2_scale); + + mask_vec.elem[2] = rand_uniform_pack4.elem[2] > rate; + tmp_float_mask = static_cast(mask_vec.elem[2]); + one_or_zero_h2.elem[1].x = tmp_float_mask; + mask_vec.elem[3] = rand_uniform_pack4.elem[3] > rate; + tmp_float_mask = static_cast(mask_vec.elem[3]); + one_or_zero_h2.elem[1].y = tmp_float_mask; + y_vec.elem[1] = __hmul2(__hmul2(x_vec.h2[1], one_or_zero_h2.elem[1]), h2_scale); + + if (has_addend) { + y_vec.elem[0] = __hadd2(y_vec.elem[0], addend_vec.h2[0]); + y_vec.elem[1] = __hadd2(y_vec.elem[1], addend_vec.h2[1]); + } + + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const float rand_uniform = hiprand_uniform(&state); + const bool mask_val = rand_uniform > rate; + tail_mask[global_thread_id] = mask_val; + float tmp_half_mask = static_cast(mask_val); + T tmp_tail_out = tail_x[global_thread_id] * static_cast(tmp_half_mask) * static_cast(h2_scale.data.x); + if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } + tail_y[global_thread_id] = tmp_tail_out; + } + + __syncthreads(); + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +template +__global__ RETURN_VOID_IF_DOUBLE FusedDropoutAddGpu( + uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, uint64_t inc_offset, + const int64_t elem_cnt, float rate, float scale, int64_t n_tail, const T* x, 
bool* mask, + const T* addend, T* y, const T* tail_x, bool* tail_mask, const T* tail_addend, T* tail_y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + using MaskType = cuda::elementwise::PackType; + using MaskPack = cuda::elementwise::Pack; + + RandPack4 rand_uniform_pack4; + bool grid_loop_rand_state = 0; + + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + if (grid_loop_rand_state == 0) { + rand_uniform_pack4.storage = hiprand_uniform4(&state); + grid_loop_rand_state ^= 1; + } else { + // Use the last two random numbers we generated in previous iteration. + rand_uniform_pack4.elem[0] = rand_uniform_pack4.elem[2]; + rand_uniform_pack4.elem[1] = rand_uniform_pack4.elem[3]; + grid_loop_rand_state ^= 1; + } + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + LoadPack addend_vec; + if (has_addend) { + const LoadType* addend_load = reinterpret_cast(addend + linear_index); + addend_vec.storage = *addend_load; + } + + MaskPack mask_vec; + LoadPack y_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + mask_vec.elem[i] = rand_uniform_pack4.elem[i] > rate; + y_vec.elem[i] = x_vec.elem[i] * mask_vec.elem[i] * scale; + if (has_addend) { y_vec.elem[i] += addend_vec.elem[i]; } + } + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + *(reinterpret_cast(mask + linear_index)) = mask_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const float rand_uniform = hiprand_uniform(&state); + const bool mask_val = rand_uniform > rate; + tail_mask[global_thread_id] = mask_val; + double tmp_tail_out = tail_x[global_thread_id] * mask_val * scale; + if (has_addend) { tmp_tail_out += tail_addend[global_thread_id]; } + tail_y[global_thread_id] = tmp_tail_out; + } + + __syncthreads(); + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +unsigned int ComputeGridSize(ep::Stream* stream, const int32_t block_size, const int64_t elem_cnt) { + auto* cuda_stream = stream->As(); + const int32_t max_threads_multi_process = + cuda_stream->device_properties().maxThreadsPerMultiProcessor; + const int32_t multi_processor_count = cuda_stream->device_properties().multiProcessorCount; + unsigned int blocks_per_sm = max_threads_multi_process / block_size; + unsigned int grid_size = std::max((int64_t)1, ((elem_cnt + block_size - 1) / block_size)); + grid_size = std::min((unsigned int)multi_processor_count * blocks_per_sm, grid_size); + return grid_size; +} + +template +void DispatchTail(ep::Stream* stream, uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, + const int64_t elem_cnt, float rate, float scale, const T* x, bool* mask, + const T* addend, T* y) { + constexpr int pack_size = GetDropoutPackSize(); + const int64_t pack_num = elem_cnt / pack_size; + unsigned int grid_size = ComputeGridSize(stream, kBlockSize, pack_num); + const int64_t tail_offset = pack_num * pack_size; + const int64_t n_tail = elem_cnt - tail_offset; + const bool tail = n_tail > 0 ? 
true : false; + uint64_t inc_offset = 0; + + if (tail) { + // If tail, we need generate randnum one more time, so here we add another `1`. + inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize + 1; + FusedDropoutAddGpu + <<As()->cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, + (x + tail_offset), (mask + tail_offset), (addend + tail_offset), (y + tail_offset)); + } else { + inc_offset = ((elem_cnt - 1) / (kBlockSize * grid_size * kVecSize) + 1) * kVecSize; + FusedDropoutAddGpu + <<As()->cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, elem_cnt, rate, scale, n_tail, x, mask, addend, y, + nullptr, nullptr, nullptr, nullptr); + } +} + +template +struct MaskAndScaleFunctor { + OF_DEVICE_FUNC explicit MaskAndScaleFunctor(float scale) : scale(scale) {} + __device__ T operator()(T x, bool mask) const { + return x * static_cast(mask) * static_cast(scale); + } + float scale; +}; + + + +template +class DropoutKernelGPU final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + DropoutKernelGPU() = default; + ~DropoutKernelGPU() = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + const auto& generator = CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA)); + return std::make_shared(generator); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + auto* fused_dropout_kernel_state = dynamic_cast(state); + CHECK_NOTNULL(fused_dropout_kernel_state); + const auto& generator = fused_dropout_kernel_state->generator(); + CHECK_NOTNULL(generator); + auto* stream = ctx->stream(); + const auto device_index = stream->device()->device_index(); + std::shared_ptr cuda_generator = + CHECK_JUST(generator->Get(device_index)); + uint64_t seed = cuda_generator->current_seed(); + + const float rate = ctx->Attr("rate"); + float scale = 0.0; + if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); } + one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); + + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + DispatchTail( + stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, + reinterpret_cast(in->dptr()), reinterpret_cast(mask->mut_dptr()), + reinterpret_cast(addend->dptr()), reinterpret_cast(out->mut_dptr())); + } else { + DispatchTail(stream, seed, cuda_gen_state, in->shape_view().elem_cnt(), rate, scale, + reinterpret_cast(in->dptr()), + reinterpret_cast(mask->mut_dptr()), nullptr, + reinterpret_cast(out->mut_dptr())); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DROPOUT_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("dropout").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == data_type) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)) + +REGISTER_DROPOUT_KERNEL_GPU(half, DataType::kFloat16); +REGISTER_DROPOUT_KERNEL_GPU(float, DataType::kFloat); +REGISTER_DROPOUT_KERNEL_GPU(double, DataType::kDouble); + + +template +class DropoutGradKernelGPU final : public 
user_op::OpKernel, public user_op::CudaGraphSupport { + public: + DropoutGradKernelGPU() = default; + ~DropoutGradKernelGPU() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float scale = ctx->Attr("scale"); + const int64_t elem_cnt = dy->shape_view().elem_cnt(); + OF_CUDA_CHECK((cuda::elementwise::Binary( + MaskAndScaleFunctor(scale), elem_cnt, reinterpret_cast(dx->mut_dptr()), + reinterpret_cast(dy->dptr()), reinterpret_cast(mask->dptr()), + ctx->stream()->As()->cuda_stream()))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == data_type)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "dy", 0, true)); \ + return Maybe::Ok(); \ + }) + +REGISTER_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16); +REGISTER_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat); +REGISTER_DROPOUT_GRAD_KERNEL_GPU(double, DataType::kDouble); + +} // namespace + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp b/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp index d614a8e..6a43bf1 100644 --- a/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp +++ b/oneflow/user/kernels/dynamic_loss_scale_schedule_kernel.hip.cpp @@ -1,67 +1,67 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -__global__ void DynamicLossScaleScheduleGpu(const int64_t increment_period, const float multiplier, - const int64_t* count_not_finite, float* loss_scale, - int64_t* good_step_counter) { - if (*count_not_finite == 0) { - int64_t cur_good_step_counter = *good_step_counter + 1; - if (cur_good_step_counter >= increment_period) { - *loss_scale = static_cast( - min(static_cast(*loss_scale) * multiplier, static_cast(FLT_MAX))); - cur_good_step_counter = 0; - } - *good_step_counter = cur_good_step_counter; - } else { - *good_step_counter = 0; - *loss_scale = static_cast(max(static_cast(*loss_scale) / multiplier, 1.0)); - } -} - -} // namespace - -class DynamicLossScaleScheduleGpuKernel final : public user_op::OpKernel { - public: - DynamicLossScaleScheduleGpuKernel() = default; - ~DynamicLossScaleScheduleGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* count_not_finite = ctx->Tensor4ArgNameAndIndex("count_not_finite", 0); - user_op::Tensor* loss_scale = ctx->Tensor4ArgNameAndIndex("loss_scale", 0); - user_op::Tensor* good_step_counter = ctx->Tensor4ArgNameAndIndex("good_step_counter", 0); - const auto increment_period = ctx->Attr("increment_period"); - const auto multiplier = ctx->Attr("multiplier"); - DynamicLossScaleScheduleGpu<<<1, 1, 0, ctx->stream()->As()->cuda_stream()>>>( - increment_period, multiplier, count_not_finite->dptr(), - loss_scale->mut_dptr(), good_step_counter->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } -}; - -REGISTER_USER_KERNEL("dynamic_loss_scale_schedule") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +__global__ void DynamicLossScaleScheduleGpu(const int64_t increment_period, const float multiplier, + const int64_t* count_not_finite, float* loss_scale, + int64_t* good_step_counter) { + if (*count_not_finite == 0) { + int64_t cur_good_step_counter = *good_step_counter + 1; + if (cur_good_step_counter >= increment_period) { + *loss_scale = static_cast( + min(static_cast(*loss_scale) * multiplier, static_cast(FLT_MAX))); + cur_good_step_counter = 0; + } + *good_step_counter = cur_good_step_counter; + } else { + *good_step_counter = 0; + *loss_scale = static_cast(max(static_cast(*loss_scale) / multiplier, 1.0)); + } +} + +} // namespace + +class DynamicLossScaleScheduleGpuKernel final : public user_op::OpKernel { + public: + DynamicLossScaleScheduleGpuKernel() = default; + ~DynamicLossScaleScheduleGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* count_not_finite = ctx->Tensor4ArgNameAndIndex("count_not_finite", 0); + user_op::Tensor* loss_scale = ctx->Tensor4ArgNameAndIndex("loss_scale", 0); + user_op::Tensor* good_step_counter = ctx->Tensor4ArgNameAndIndex("good_step_counter", 0); + const auto increment_period = ctx->Attr("increment_period"); + const auto multiplier = ctx->Attr("multiplier"); + DynamicLossScaleScheduleGpu<<<1, 1, 0, ctx->stream()->As()->cuda_stream()>>>( + increment_period, multiplier, count_not_finite->dptr(), + loss_scale->mut_dptr(), good_step_counter->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } +}; + +REGISTER_USER_KERNEL("dynamic_loss_scale_schedule") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/eager_nccl_kernels.hip.cpp b/oneflow/user/kernels/eager_nccl_kernels.hip.cpp index 55ebfcf..77d2ad6 100644 --- a/oneflow/user/kernels/eager_nccl_kernels.hip.cpp +++ b/oneflow/user/kernels/eager_nccl_kernels.hip.cpp @@ -1,404 +1,404 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/common/container_util.h" -#include "oneflow/core/control/global_process_ctx.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/nccl_util.h" -#include "oneflow/core/job/eager_nccl_comm_manager.h" -#include "oneflow/core/job/parallel_desc.h" -#include "oneflow/core/ep/include/primitive/permute.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#if defined(WITH_ROCM) - -namespace oneflow { - -namespace { - -class EagerNcclOpKernelCache final : public user_op::OpKernelCache { - public: - explicit EagerNcclOpKernelCache(user_op::KernelCacheContext* ctx) { Init(ctx); } - ~EagerNcclOpKernelCache() override = default; - - Symbol parallel_desc() const { return parallel_desc_; } - ncclComm_t comm() const { return comm_; } - - private: - void Init(user_op::KernelCacheContext* ctx) { - const std::string& parallel_conf_txt = ctx->Attr("parallel_conf"); - ParallelConf parallel_conf; - std::set> device_set; - CHECK(TxtString2PbMessage(parallel_conf_txt, ¶llel_conf)); - parallel_desc_ = SymbolOf(ParallelDesc(parallel_conf)); - FOR_RANGE(int64_t, parallel_id, 0, parallel_desc_->parallel_num()) { - int64_t machine_id = CHECK_JUST(parallel_desc_->MachineId4ParallelId(parallel_id)); - int64_t device_id = CHECK_JUST(parallel_desc_->DeviceId4ParallelId(parallel_id)); - device_set.emplace(std::make_pair(machine_id, device_id)); - } - comm_ = CHECK_NOTNULL(Singleton::Get())->GetCommForDevice(device_set); - } - - Symbol parallel_desc_; - ncclComm_t comm_{}; -}; - -size_t InferEagerNcclS2SKernelTmpBufferSize(user_op::InferContext* ctx) { - const user_op::TensorDesc& in_tensor = ctx->InputTensorDesc("in", 0); - size_t tensor_byte_size = - GetCudaAlignedSize(in_tensor.shape().elem_cnt() * GetSizeOfDataType(in_tensor.data_type())); - // NOTE(hanbinbin): Set tmp_buffer_size to twice tensor_byte_size because the - // SbpParallel4ArgNameAndIndex function of LocalUserOpInferContext is unimplemented - return tensor_byte_size * 2; -} - -void InitEagerNcclOpKernelCache(user_op::KernelCacheContext* ctx, - std::shared_ptr* cache_ptr) { - // NOTE(jianhao): the cache only depends on parallel_conf, and the kernel is singleton - // once parallel_conf is determined, so only init the cache at the first time. 
- if (*cache_ptr == nullptr) { *cache_ptr = std::make_shared(ctx); } -} -} // namespace - -class EagerNcclAllReduceKernel final : public user_op::OpKernel { - public: - EagerNcclAllReduceKernel() = default; - ~EagerNcclAllReduceKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->shape_view(), out->shape_view()); - CHECK_EQ(in->data_type(), out->data_type()); - ncclRedOp_t reduce_type = ncclSum; - if (in->data_type() == kBool) { reduce_type = ncclMax; } - OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), - GetNcclDataType(in->data_type()), reduce_type, kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_all_reduce") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclBroadcastKernel final : public user_op::OpKernel { - public: - EagerNcclBroadcastKernel() = default; - ~EagerNcclBroadcastKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t root = ctx->Attr("root"); - int64_t dev_id = GlobalProcessCtx::LocalRank(root); - int64_t nccl_root = - CHECK_JUST(kernel_cache->parallel_desc()->ParallelId4MachineDeviceId(root, dev_id)); - const void* in_ptr = nullptr; - if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape_view(), out->shape_view()); - CHECK_EQ(in->data_type(), out->data_type()); - in_ptr = in->dptr(); - } - OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape_view().elem_cnt(), - GetNcclDataType(out->data_type()), nccl_root, kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_broadcast") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclTouchKernel final : public user_op::OpKernel { - public: - EagerNcclTouchKernel() = default; - ~EagerNcclTouchKernel() override = default; - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override{ - // Do nothing. 
- }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } -}; - -REGISTER_USER_KERNEL("eager_nccl_touch") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclReduceKernel final : public user_op::OpKernel { - public: - EagerNcclReduceKernel() = default; - ~EagerNcclReduceKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int64_t root = ctx->Attr("root"); - void* out_ptr = nullptr; - if (GlobalProcessCtx::Rank() == root) { - CHECK_EQ(in->shape_view(), out->shape_view()); - CHECK_EQ(in->data_type(), out->data_type()); - out_ptr = out->mut_dptr(); - } - ncclRedOp_t reduce_type = ncclSum; - if (in->data_type() == kBool) { reduce_type = ncclMax; } - OF_NCCL_CHECK(ncclReduce(in->dptr(), out_ptr, in->shape_view().elem_cnt(), - GetNcclDataType(in->data_type()), reduce_type, root, - kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_reduce") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclReduceScatterKernel final : public user_op::OpKernel { - public: - EagerNcclReduceScatterKernel() = default; - ~EagerNcclReduceScatterKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->data_type(), out->data_type()); - ncclRedOp_t reduce_type = ncclSum; - if (in->data_type() == kBool) { - reduce_type = ncclMax; - } else { - const auto& op_type = ctx->Attr("op_type"); - reduce_type = CHECK_JUST(MapAt(op_type2ncclRedOp_t, op_type)); - } - OF_NCCL_CHECK(ncclReduceScatter( - in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), - reduce_type, kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - static HashMap op_type2ncclRedOp_t; -}; - -HashMap EagerNcclReduceScatterKernel::op_type2ncclRedOp_t = { - {"sum", ncclSum}, {"max", ncclMax}}; - -REGISTER_USER_KERNEL("eager_nccl_reduce_scatter") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -class EagerNcclAllGatherKernel final : public user_op::OpKernel { - public: - EagerNcclAllGatherKernel() = default; - ~EagerNcclAllGatherKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); 
- } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - CHECK_EQ(in->data_type(), out->data_type()); - OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), - GetNcclDataType(in->data_type()), kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("eager_nccl_all_gather") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -template -class EagerNcclS2SKernel final : public user_op::OpKernel { - public: - EagerNcclS2SKernel() = default; - ~EagerNcclS2SKernel() override = default; - - void InitOpKernelCacheWithFlags( - user_op::KernelCacheContext* ctx, int8_t flag, - std::shared_ptr* cache_ptr) const override { - InitEagerNcclOpKernelCache(ctx, cache_ptr); - } - - private: - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, - const user_op::OpKernelCache* cache) const override { - auto* kernel_cache = dynamic_cast(cache); - CHECK(kernel_cache != nullptr); - // NOTE(hanbinbin): Compute logic copy from _nccl_logical_s2s - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - int64_t tmp_size = 0; - const int64_t dtype_size = GetSizeOfDataType(in->data_type()); - int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); - // NOTE(chengcheng): in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out - const char* pack_to_ptr = in->dptr(); - char* unpack_from_ptr = out->mut_dptr(); - if (tmp_buffer) { tmp_size = tmp_buffer->shape_view().elem_cnt(); } - CHECK(tmp_size == 0 || tmp_size == data_size || tmp_size == data_size * 2); - - CHECK_EQ(in->data_type(), out->data_type()); - const int64_t num_ranks = kernel_cache->parallel_desc()->parallel_num(); - CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()) - << in->shape_view().ToString() << " vs " << out->shape_view().ToString(); - const int64_t elem_cnt = in->shape_view().elem_cnt(); - const int64_t in_split_axis = ctx->Attr("in_split_axis"); - const int64_t out_split_axis = ctx->Attr("out_split_axis"); - - DimVector logical_shape_dim_vec; - in->shape_view().ToDimVector(&logical_shape_dim_vec); - logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; - - if (out_split_axis != 0) { - // NOTE(chengcheng): Do pack. 
Need transpose in -> pack_to - // pack use temp buffer offset: [0, data_size] - pack_to_ptr = tmp_buffer->dptr(); - DimVector transpose_in_dim_vec = logical_shape_dim_vec; - CHECK_EQ(transpose_in_dim_vec.at(in_split_axis) % num_ranks, 0); - transpose_in_dim_vec[in_split_axis] = transpose_in_dim_vec.at(in_split_axis) / num_ranks; - CHECK_EQ(transpose_in_dim_vec.at(out_split_axis) % num_ranks, 0); - transpose_in_dim_vec[out_split_axis] = transpose_in_dim_vec.at(out_split_axis) / num_ranks; - transpose_in_dim_vec.insert(transpose_in_dim_vec.begin() + out_split_axis, num_ranks); - std::vector perm; - perm.emplace_back(out_split_axis); - FOR_RANGE(int64_t, i, 0, transpose_in_dim_vec.size()) { - if (i != out_split_axis) { perm.emplace_back(i); } - } - auto transpose = ep::primitive::NewPrimitive( - ctx->stream()->device_type(), transpose_in_dim_vec.size()); - CHECK(transpose); - transpose->Launch(ctx->stream(), in->data_type(), transpose_in_dim_vec.size(), - transpose_in_dim_vec.data(), in->dptr(), perm.data(), - tmp_buffer->mut_dptr()); - } - - if (in_split_axis != 0) { - // NOTE(chengcheng): Do unpack. Need transpose unpack_from -> out - // unpack use temp buffer offset: [tmp_size - data_size, tmp_size] - unpack_from_ptr = tmp_buffer->mut_dptr() + (tmp_size - data_size); - } - - { - // NOTE: Do S2S - OF_NCCL_CHECK(ncclGroupStart()); - const int64_t elem_per_chunk = elem_cnt / num_ranks; - const int64_t chunk_size = elem_per_chunk * dtype_size; - for (int64_t j = 0; j < num_ranks; ++j) { - OF_NCCL_CHECK(ncclSend(reinterpret_cast( - reinterpret_cast(pack_to_ptr) + j * chunk_size), - elem_per_chunk, GetNcclDataType(in->data_type()), j, - kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - OF_NCCL_CHECK(ncclRecv( - reinterpret_cast(reinterpret_cast(unpack_from_ptr) + j * chunk_size), - elem_per_chunk, GetNcclDataType(in->data_type()), j, kernel_cache->comm(), - ctx->stream()->As()->cuda_stream())); - } - OF_NCCL_CHECK(ncclGroupEnd()); - } - - if (in_split_axis != 0) { - // Do unpack. 
- CHECK(unpack_from_ptr != out->mut_dptr()); - DimVector unpack_from_dim_vec = logical_shape_dim_vec; - CHECK_EQ(unpack_from_dim_vec.at(in_split_axis) % num_ranks, 0); - unpack_from_dim_vec[in_split_axis] = unpack_from_dim_vec.at(in_split_axis) / num_ranks; - CHECK_EQ(unpack_from_dim_vec.at(out_split_axis) % num_ranks, 0); - unpack_from_dim_vec[out_split_axis] = unpack_from_dim_vec.at(out_split_axis) / num_ranks; - unpack_from_dim_vec.insert(unpack_from_dim_vec.begin(), num_ranks); - std::vector perm; - FOR_RANGE(int64_t, i, 1, unpack_from_dim_vec.size()) { perm.emplace_back(i); } - perm.insert(perm.begin() + in_split_axis, 0); - auto transpose = ep::primitive::NewPrimitive( - ctx->stream()->device_type(), unpack_from_dim_vec.size()); - CHECK(transpose); - transpose->Launch(ctx->stream(), in->data_type(), unpack_from_dim_vec.size(), - unpack_from_dim_vec.data(), unpack_from_ptr, perm.data(), out->mut_dptr()); - } - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_EAGER_NCCL_S2S_KERNEL(dtype) \ - REGISTER_USER_KERNEL("eager_nccl_s2s") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferEagerNcclS2SKernelTmpBufferSize); - -REGISTER_EAGER_NCCL_S2S_KERNEL(int8_t) -REGISTER_EAGER_NCCL_S2S_KERNEL(int32_t) -REGISTER_EAGER_NCCL_S2S_KERNEL(int64_t) -REGISTER_EAGER_NCCL_S2S_KERNEL(bool) -REGISTER_EAGER_NCCL_S2S_KERNEL(float) -REGISTER_EAGER_NCCL_S2S_KERNEL(double) -REGISTER_EAGER_NCCL_S2S_KERNEL(float16) -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/common/container_util.h" +#include "oneflow/core/control/global_process_ctx.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/nccl_util.h" +#include "oneflow/core/job/eager_nccl_comm_manager.h" +#include "oneflow/core/job/parallel_desc.h" +#include "oneflow/core/ep/include/primitive/permute.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#if defined(WITH_ROCM) + +namespace oneflow { + +namespace { + +class EagerNcclOpKernelCache final : public user_op::OpKernelCache { + public: + explicit EagerNcclOpKernelCache(user_op::KernelCacheContext* ctx) { Init(ctx); } + ~EagerNcclOpKernelCache() override = default; + + Symbol parallel_desc() const { return parallel_desc_; } + ncclComm_t comm() const { return comm_; } + + private: + void Init(user_op::KernelCacheContext* ctx) { + const std::string& parallel_conf_txt = ctx->Attr("parallel_conf"); + ParallelConf parallel_conf; + std::set> device_set; + CHECK(TxtString2PbMessage(parallel_conf_txt, ¶llel_conf)); + parallel_desc_ = SymbolOf(ParallelDesc(parallel_conf)); + FOR_RANGE(int64_t, parallel_id, 0, parallel_desc_->parallel_num()) { + int64_t machine_id = CHECK_JUST(parallel_desc_->MachineId4ParallelId(parallel_id)); + int64_t device_id = CHECK_JUST(parallel_desc_->DeviceId4ParallelId(parallel_id)); + device_set.emplace(std::make_pair(machine_id, device_id)); + } + comm_ = CHECK_NOTNULL(Singleton::Get())->GetCommForDevice(device_set); + } + + Symbol parallel_desc_; + ncclComm_t comm_{}; +}; + +size_t InferEagerNcclS2SKernelTmpBufferSize(user_op::InferContext* ctx) { + const user_op::TensorDesc& in_tensor = ctx->InputTensorDesc("in", 0); + size_t tensor_byte_size = + GetCudaAlignedSize(in_tensor.shape().elem_cnt() * GetSizeOfDataType(in_tensor.data_type())); + // NOTE(hanbinbin): Set tmp_buffer_size to twice tensor_byte_size because the + // SbpParallel4ArgNameAndIndex function of LocalUserOpInferContext is unimplemented + return tensor_byte_size * 2; +} + +void InitEagerNcclOpKernelCache(user_op::KernelCacheContext* ctx, + std::shared_ptr* cache_ptr) { + // NOTE(jianhao): the cache only depends on parallel_conf, and the kernel is singleton + // once parallel_conf is determined, so only init the cache at the first time. 
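// A note on how this helper is used (descriptive comment, based on the kernels defined
// below in this file): every collective kernel here except EagerNcclTouchKernel overrides
// InitOpKernelCacheWithFlags() and forwards to InitEagerNcclOpKernelCache(), so the
// ParallelDesc parsing and the ncclComm_t lookup run once per kernel instance and the
// resulting EagerNcclOpKernelCache is reused by every subsequent Compute() call.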
+ if (*cache_ptr == nullptr) { *cache_ptr = std::make_shared(ctx); } +} +} // namespace + +class EagerNcclAllReduceKernel final : public user_op::OpKernel { + public: + EagerNcclAllReduceKernel() = default; + ~EagerNcclAllReduceKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + CHECK_EQ(in->shape_view(), out->shape_view()); + CHECK_EQ(in->data_type(), out->data_type()); + ncclRedOp_t reduce_type = ncclSum; + if (in->data_type() == kBool) { reduce_type = ncclMax; } + OF_NCCL_CHECK(ncclAllReduce(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), + GetNcclDataType(in->data_type()), reduce_type, kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_all_reduce") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclBroadcastKernel final : public user_op::OpKernel { + public: + EagerNcclBroadcastKernel() = default; + ~EagerNcclBroadcastKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + int64_t root = ctx->Attr("root"); + int64_t dev_id = GlobalProcessCtx::LocalRank(root); + int64_t nccl_root = + CHECK_JUST(kernel_cache->parallel_desc()->ParallelId4MachineDeviceId(root, dev_id)); + const void* in_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape_view(), out->shape_view()); + CHECK_EQ(in->data_type(), out->data_type()); + in_ptr = in->dptr(); + } + OF_NCCL_CHECK(ncclBroadcast(in_ptr, out->mut_dptr(), out->shape_view().elem_cnt(), + GetNcclDataType(out->data_type()), nccl_root, kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_broadcast") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclTouchKernel final : public user_op::OpKernel { + public: + EagerNcclTouchKernel() = default; + ~EagerNcclTouchKernel() override = default; + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override{ + // Do nothing. 
+ }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } +}; + +REGISTER_USER_KERNEL("eager_nccl_touch") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclReduceKernel final : public user_op::OpKernel { + public: + EagerNcclReduceKernel() = default; + ~EagerNcclReduceKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + int64_t root = ctx->Attr("root"); + void* out_ptr = nullptr; + if (GlobalProcessCtx::Rank() == root) { + CHECK_EQ(in->shape_view(), out->shape_view()); + CHECK_EQ(in->data_type(), out->data_type()); + out_ptr = out->mut_dptr(); + } + ncclRedOp_t reduce_type = ncclSum; + if (in->data_type() == kBool) { reduce_type = ncclMax; } + OF_NCCL_CHECK(ncclReduce(in->dptr(), out_ptr, in->shape_view().elem_cnt(), + GetNcclDataType(in->data_type()), reduce_type, root, + kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_reduce") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclReduceScatterKernel final : public user_op::OpKernel { + public: + EagerNcclReduceScatterKernel() = default; + ~EagerNcclReduceScatterKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + CHECK_EQ(in->data_type(), out->data_type()); + ncclRedOp_t reduce_type = ncclSum; + if (in->data_type() == kBool) { + reduce_type = ncclMax; + } else { + const auto& op_type = ctx->Attr("op_type"); + reduce_type = CHECK_JUST(MapAt(op_type2ncclRedOp_t, op_type)); + } + OF_NCCL_CHECK(ncclReduceScatter( + in->dptr(), out->mut_dptr(), out->shape_view().elem_cnt(), GetNcclDataType(in->data_type()), + reduce_type, kernel_cache->comm(), ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + static HashMap op_type2ncclRedOp_t; +}; + +HashMap EagerNcclReduceScatterKernel::op_type2ncclRedOp_t = { + {"sum", ncclSum}, {"max", ncclMax}}; + +REGISTER_USER_KERNEL("eager_nccl_reduce_scatter") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +class EagerNcclAllGatherKernel final : public user_op::OpKernel { + public: + EagerNcclAllGatherKernel() = default; + ~EagerNcclAllGatherKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); 
+ } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + CHECK_EQ(in->data_type(), out->data_type()); + OF_NCCL_CHECK(ncclAllGather(in->dptr(), out->mut_dptr(), in->shape_view().elem_cnt(), + GetNcclDataType(in->data_type()), kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("eager_nccl_all_gather") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +template +class EagerNcclS2SKernel final : public user_op::OpKernel { + public: + EagerNcclS2SKernel() = default; + ~EagerNcclS2SKernel() override = default; + + void InitOpKernelCacheWithFlags( + user_op::KernelCacheContext* ctx, int8_t flag, + std::shared_ptr* cache_ptr) const override { + InitEagerNcclOpKernelCache(ctx, cache_ptr); + } + + private: + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState*, + const user_op::OpKernelCache* cache) const override { + auto* kernel_cache = dynamic_cast(cache); + CHECK(kernel_cache != nullptr); + // NOTE(hanbinbin): Compute logic copy from _nccl_logical_s2s + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + int64_t tmp_size = 0; + const int64_t dtype_size = GetSizeOfDataType(in->data_type()); + int64_t data_size = GetCudaAlignedSize(in->shape_view().elem_cnt() * dtype_size); + // NOTE(chengcheng): in (transpose)-> pack_to_ptr (all2all)-> unpack_from_ptr (transpose)-> out + const char* pack_to_ptr = in->dptr(); + char* unpack_from_ptr = out->mut_dptr(); + if (tmp_buffer) { tmp_size = tmp_buffer->shape_view().elem_cnt(); } + CHECK(tmp_size == 0 || tmp_size == data_size || tmp_size == data_size * 2); + + CHECK_EQ(in->data_type(), out->data_type()); + const int64_t num_ranks = kernel_cache->parallel_desc()->parallel_num(); + CHECK_EQ(in->shape_view().elem_cnt(), out->shape_view().elem_cnt()) + << in->shape_view().ToString() << " vs " << out->shape_view().ToString(); + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t in_split_axis = ctx->Attr("in_split_axis"); + const int64_t out_split_axis = ctx->Attr("out_split_axis"); + + DimVector logical_shape_dim_vec; + in->shape_view().ToDimVector(&logical_shape_dim_vec); + logical_shape_dim_vec[in_split_axis] = logical_shape_dim_vec.at(in_split_axis) * num_ranks; + + if (out_split_axis != 0) { + // NOTE(chengcheng): Do pack. 
Need transpose in -> pack_to + // pack use temp buffer offset: [0, data_size] + pack_to_ptr = tmp_buffer->dptr(); + DimVector transpose_in_dim_vec = logical_shape_dim_vec; + CHECK_EQ(transpose_in_dim_vec.at(in_split_axis) % num_ranks, 0); + transpose_in_dim_vec[in_split_axis] = transpose_in_dim_vec.at(in_split_axis) / num_ranks; + CHECK_EQ(transpose_in_dim_vec.at(out_split_axis) % num_ranks, 0); + transpose_in_dim_vec[out_split_axis] = transpose_in_dim_vec.at(out_split_axis) / num_ranks; + transpose_in_dim_vec.insert(transpose_in_dim_vec.begin() + out_split_axis, num_ranks); + std::vector perm; + perm.emplace_back(out_split_axis); + FOR_RANGE(int64_t, i, 0, transpose_in_dim_vec.size()) { + if (i != out_split_axis) { perm.emplace_back(i); } + } + auto transpose = ep::primitive::NewPrimitive( + ctx->stream()->device_type(), transpose_in_dim_vec.size()); + CHECK(transpose); + transpose->Launch(ctx->stream(), in->data_type(), transpose_in_dim_vec.size(), + transpose_in_dim_vec.data(), in->dptr(), perm.data(), + tmp_buffer->mut_dptr()); + } + + if (in_split_axis != 0) { + // NOTE(chengcheng): Do unpack. Need transpose unpack_from -> out + // unpack use temp buffer offset: [tmp_size - data_size, tmp_size] + unpack_from_ptr = tmp_buffer->mut_dptr() + (tmp_size - data_size); + } + + { + // NOTE: Do S2S + OF_NCCL_CHECK(ncclGroupStart()); + const int64_t elem_per_chunk = elem_cnt / num_ranks; + const int64_t chunk_size = elem_per_chunk * dtype_size; + for (int64_t j = 0; j < num_ranks; ++j) { + OF_NCCL_CHECK(ncclSend(reinterpret_cast( + reinterpret_cast(pack_to_ptr) + j * chunk_size), + elem_per_chunk, GetNcclDataType(in->data_type()), j, + kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + OF_NCCL_CHECK(ncclRecv( + reinterpret_cast(reinterpret_cast(unpack_from_ptr) + j * chunk_size), + elem_per_chunk, GetNcclDataType(in->data_type()), j, kernel_cache->comm(), + ctx->stream()->As()->cuda_stream())); + } + OF_NCCL_CHECK(ncclGroupEnd()); + } + + if (in_split_axis != 0) { + // Do unpack. 
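// Shape-bookkeeping sketch for the unpack transpose below, assuming (for illustration only)
// num_ranks = 2, logical shape [4, 6], in_split_axis = 1, out_split_axis = 0:
//   * each rank holds a local [4, 3] input (columns split); pack is skipped because
//     out_split_axis == 0, so the all-to-all exchanges contiguous chunks of
//     elem_per_chunk = 12 / 2 = 6 elements (two rows of the local [4, 3] buffer);
//   * after the exchange, unpack_from_ptr holds a [2, 2, 3] buffer whose leading
//     dimension is the source rank;
//   * unpack_from_dim_vec becomes [2, 2, 3] and perm becomes [1, 0, 2], so the
//     transpose interleaves the per-rank column blocks row by row, producing the
//     final local [2, 6] output (rows split across ranks).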
+ CHECK(unpack_from_ptr != out->mut_dptr()); + DimVector unpack_from_dim_vec = logical_shape_dim_vec; + CHECK_EQ(unpack_from_dim_vec.at(in_split_axis) % num_ranks, 0); + unpack_from_dim_vec[in_split_axis] = unpack_from_dim_vec.at(in_split_axis) / num_ranks; + CHECK_EQ(unpack_from_dim_vec.at(out_split_axis) % num_ranks, 0); + unpack_from_dim_vec[out_split_axis] = unpack_from_dim_vec.at(out_split_axis) / num_ranks; + unpack_from_dim_vec.insert(unpack_from_dim_vec.begin(), num_ranks); + std::vector perm; + FOR_RANGE(int64_t, i, 1, unpack_from_dim_vec.size()) { perm.emplace_back(i); } + perm.insert(perm.begin() + in_split_axis, 0); + auto transpose = ep::primitive::NewPrimitive( + ctx->stream()->device_type(), unpack_from_dim_vec.size()); + CHECK(transpose); + transpose->Launch(ctx->stream(), in->data_type(), unpack_from_dim_vec.size(), + unpack_from_dim_vec.data(), unpack_from_ptr, perm.data(), out->mut_dptr()); + } + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_EAGER_NCCL_S2S_KERNEL(dtype) \ + REGISTER_USER_KERNEL("eager_nccl_s2s") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferEagerNcclS2SKernelTmpBufferSize); + +REGISTER_EAGER_NCCL_S2S_KERNEL(int8_t) +REGISTER_EAGER_NCCL_S2S_KERNEL(int32_t) +REGISTER_EAGER_NCCL_S2S_KERNEL(int64_t) +REGISTER_EAGER_NCCL_S2S_KERNEL(bool) +REGISTER_EAGER_NCCL_S2S_KERNEL(float) +REGISTER_EAGER_NCCL_S2S_KERNEL(double) +REGISTER_EAGER_NCCL_S2S_KERNEL(float16) +} // namespace oneflow + #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp b/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp index 9e9e330..e139f4f 100644 --- a/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp +++ b/oneflow/user/kernels/elementwise_maximum_minimum_kernel.hip.cpp @@ -1,57 +1,57 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/elementwise_maximum_minimum_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { -template class Opt, typename T> -__global__ void ElementwiseXimumGradGpuKernel(int64_t elem_cnt, const T* dz, const T* x, const T* y, - T* dx, T* dy) { - XPU_1D_KERNEL_LOOP(idx, elem_cnt) { - Opt()(dz[idx], x[idx], y[idx], dx ? &dx[idx] : nullptr, dy ? 
&dy[idx] : nullptr); - } -} - -template class Opt, typename T> -struct ElemwiseXimumGradFunctor final { - void operator()(ep::Stream* stream, int64_t elem_cnt, const T* dz, const T* x, const T* y, T* dx, - T* dy) { - ElementwiseXimumGradGpuKernel - <<As()->cuda_stream()>>>(elem_cnt, dz, x, y, dx, dy); - } -}; - -template class Opt, typename T> -struct ElemwiseXimumFunctor final { - void operator()(ep::Stream* stream, int64_t elem_cnt, T* z, const T* x, const T* y) { - OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), elem_cnt, z, x, y, - stream->As()->cuda_stream())); - } -}; -} // namespace - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MAXIMUM_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MINIMUM_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/elementwise_maximum_minimum_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { +template class Opt, typename T> +__global__ void ElementwiseXimumGradGpuKernel(int64_t elem_cnt, const T* dz, const T* x, const T* y, + T* dx, T* dy) { + XPU_1D_KERNEL_LOOP(idx, elem_cnt) { + Opt()(dz[idx], x[idx], y[idx], dx ? &dx[idx] : nullptr, dy ? &dy[idx] : nullptr); + } +} + +template class Opt, typename T> +struct ElemwiseXimumGradFunctor final { + void operator()(ep::Stream* stream, int64_t elem_cnt, const T* dz, const T* x, const T* y, T* dx, + T* dy) { + ElementwiseXimumGradGpuKernel + <<As()->cuda_stream()>>>(elem_cnt, dz, x, y, dx, dy); + } +}; + +template class Opt, typename T> +struct ElemwiseXimumFunctor final { + void operator()(ep::Stream* stream, int64_t elem_cnt, T* z, const T* x, const T* y) { + OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), elem_cnt, z, x, y, + stream->As()->cuda_stream())); + } +}; +} // namespace + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MAXIMUM_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MINIMUM_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) +} // namespace oneflow #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/embedding_kernel.hip.cpp b/oneflow/user/kernels/embedding_kernel.hip.cpp index 5c9c6f7..2dc5e55 100644 --- a/oneflow/user/kernels/embedding_kernel.hip.cpp +++ b/oneflow/user/kernels/embedding_kernel.hip.cpp @@ -1,160 +1,160 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/ep/include/primitive/memset.h" -#include "oneflow/user/kernels/embedding_kernel_util.h" - -namespace oneflow { - -template -class GpuEmbeddingRenormKernel final : public user_op::OpKernel { - public: - GpuEmbeddingRenormKernel() = default; - ~GpuEmbeddingRenormKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const double max_norm = ctx->Attr("max_norm"); - const double norm_type = ctx->Attr("norm_type"); - - const ShapeView& in_shape = in->shape_view(); - const int64_t emb_size = in_shape.At(0); - const int64_t emb_dim = in_shape.At(1); - const T* in_buf = in->dptr(); - const IndexType* indices_buf = indices->dptr(); - T* out_buf = out->mut_dptr(); - const int64_t num_indices = indices->shape_view().elem_cnt(); - int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr(); - std::unique_ptr memset_primitive = - ep::primitive::NewPrimitive(ctx->device_type()); - CHECK(memset_primitive); - memset_primitive->Launch(ctx->stream(), tmp_buf, 0, - GetCudaAlignedSize(sizeof(int32_t) * emb_size)); - EmbeddingReNormFunctor()( - ctx->stream(), in_buf, indices_buf, out_buf, max_norm, norm_type, num_indices, emb_size, - emb_dim, tmp_buf); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuEmbeddingKernel final : public user_op::OpKernel { - public: - GpuEmbeddingKernel() = default; - ~GpuEmbeddingKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t padding_idx = ctx->Attr("padding_idx"); - const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - - const int64_t num_indices = indices->shape_view().elem_cnt(); - const int64_t emb_size = weight->shape_view().At(0); - const int64_t emb_dim = weight->shape_view().At(1); - const T* weight_buf = weight->dptr(); - const IndexType* indices_buf = indices->dptr(); - T* out_buf = out->mut_dptr(); - - EmbeddingFunctor()(ctx->stream(), weight_buf, indices_buf, - out_buf, padding_idx, scale_grad_by_freq, - num_indices, emb_size, emb_dim); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class GpuEmbeddingGradKernel final : public user_op::OpKernel { - public: - GpuEmbeddingGradKernel() = default; - ~GpuEmbeddingGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - 
const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t padding_idx = ctx->Attr("padding_idx"); - const bool scale_grad_by_freq = ctx->Attr("scale_grad_by_freq"); - - const int64_t num_indices = indices->shape_view().elem_cnt(); - const int64_t emb_size = weight->shape_view().At(0); - const int64_t emb_dim = weight->shape_view().At(1); - - const T* dy_buf = dy->dptr(); - const IndexType* indices_buf = indices->dptr(); - T* dx_buf = dx->mut_dptr(); - int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr(); - std::unique_ptr memset_primitive = - ep::primitive::NewPrimitive(ctx->device_type()); - CHECK(memset_primitive); - memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape_view().elem_cnt() * sizeof(T)); - memset_primitive->Launch(ctx->stream(), tmp_buf, 0, - GetCudaAlignedSize(sizeof(int32_t) * emb_size)); - EmbeddingGradFunctor()( - ctx->stream(), dy_buf, indices_buf, dx_buf, padding_idx, scale_grad_by_freq, num_indices, - emb_size, emb_dim, tmp_buf); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_KERNEL(in_type, indices_type) \ - REGISTER_USER_KERNEL("embedding_renorm") \ - .SetCreateFn< \ - GpuEmbeddingRenormKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int64_t emb_size = in_shape.At(0); \ - return GetCudaAlignedSize(sizeof(int32_t) * emb_size); \ - }); \ - REGISTER_USER_KERNEL("embedding") \ - .SetCreateFn< \ - GpuEmbeddingKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))); \ - REGISTER_USER_KERNEL("embedding_grad") \ - .SetCreateFn< \ - GpuEmbeddingGradKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type)) \ - && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("weight", 0); \ - const int64_t emb_size = in_shape.At(0); \ - return GetCudaAlignedSize(sizeof(int32_t) * emb_size); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_KERNEL, EMBEDDING_DATA_TYPE_SEQ_CUDA, - INDEX_DATA_TYPE_SEQ) -#undef REGISTER_CUDA_EMBEDDING_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#include "oneflow/core/ep/include/primitive/memset.h"
+#include "oneflow/user/kernels/embedding_kernel_util.h"
+
+namespace oneflow {
+
+template<typename T, typename IndexType>
+class GpuEmbeddingRenormKernel final : public user_op::OpKernel {
+ public:
+  GpuEmbeddingRenormKernel() = default;
+  ~GpuEmbeddingRenormKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
+    const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    const double max_norm = ctx->Attr<double>("max_norm");
+    const double norm_type = ctx->Attr<double>("norm_type");
+
+    const ShapeView& in_shape = in->shape_view();
+    const int64_t emb_size = in_shape.At(0);
+    const int64_t emb_dim = in_shape.At(1);
+    const T* in_buf = in->dptr<T>();
+    const IndexType* indices_buf = indices->dptr<IndexType>();
+    T* out_buf = out->mut_dptr<T>();
+    const int64_t num_indices = indices->shape_view().elem_cnt();
+    int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr<int32_t>();
+    std::unique_ptr<ep::primitive::Memset> memset_primitive =
+        ep::primitive::NewPrimitive<ep::primitive::MemsetFactory>(ctx->device_type());
+    CHECK(memset_primitive);
+    memset_primitive->Launch(ctx->stream(), tmp_buf, 0,
+                             GetCudaAlignedSize(sizeof(int32_t) * emb_size));
+    EmbeddingReNormFunctor<DeviceType::kCUDA, T, IndexType>()(
+        ctx->stream(), in_buf, indices_buf, out_buf, max_norm, norm_type, num_indices, emb_size,
+        emb_dim, tmp_buf);
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+template<typename T, typename IndexType>
+class GpuEmbeddingKernel final : public user_op::OpKernel {
+ public:
+  GpuEmbeddingKernel() = default;
+  ~GpuEmbeddingKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0);
+    const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    const int64_t padding_idx = ctx->Attr<int64_t>("padding_idx");
+    const bool scale_grad_by_freq = ctx->Attr<bool>("scale_grad_by_freq");
+
+    const int64_t num_indices = indices->shape_view().elem_cnt();
+    const int64_t emb_size = weight->shape_view().At(0);
+    const int64_t emb_dim = weight->shape_view().At(1);
+    const T* weight_buf = weight->dptr<T>();
+    const IndexType* indices_buf = indices->dptr<IndexType>();
+    T* out_buf = out->mut_dptr<T>();
+
+    EmbeddingFunctor<DeviceType::kCUDA, T, IndexType>()(ctx->stream(), weight_buf, indices_buf,
+                                                        out_buf, padding_idx, scale_grad_by_freq,
+                                                        num_indices, emb_size, emb_dim);
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+template<typename T, typename IndexType>
+class GpuEmbeddingGradKernel final : public user_op::OpKernel {
+ public:
+  GpuEmbeddingGradKernel() = default;
+  ~GpuEmbeddingGradKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0);
+    const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0);
+    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
+    const int64_t padding_idx = ctx->Attr<int64_t>("padding_idx");
+    const bool scale_grad_by_freq = ctx->Attr<bool>("scale_grad_by_freq");
+
+    const int64_t num_indices = indices->shape_view().elem_cnt();
+    const int64_t emb_size = weight->shape_view().At(0);
+    const int64_t emb_dim = weight->shape_view().At(1);
+
+    const T* dy_buf = dy->dptr<T>();
+    const IndexType* indices_buf = indices->dptr<IndexType>();
+    T* dx_buf = dx->mut_dptr<T>();
+    int32_t* tmp_buf = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0)->mut_dptr<int32_t>();
+    std::unique_ptr<ep::primitive::Memset> memset_primitive =
+        ep::primitive::NewPrimitive<ep::primitive::MemsetFactory>(ctx->device_type());
+    CHECK(memset_primitive);
+    memset_primitive->Launch(ctx->stream(), dx_buf, 0, dx->shape_view().elem_cnt() * sizeof(T));
+    memset_primitive->Launch(ctx->stream(), tmp_buf, 0,
+                             GetCudaAlignedSize(sizeof(int32_t) * emb_size));
+    EmbeddingGradFunctor<DeviceType::kCUDA, T, IndexType>()(
+        ctx->stream(), dy_buf, indices_buf, dx_buf, padding_idx, scale_grad_by_freq, num_indices,
+        emb_size, emb_dim, tmp_buf);
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_CUDA_EMBEDDING_KERNEL(in_type, indices_type)                                      \
+  REGISTER_USER_KERNEL("embedding_renorm")                                                         \
+      .SetCreateFn<                                                                                \
+          GpuEmbeddingRenormKernel<OF_PP_PAIR_FIRST(in_type), OF_PP_PAIR_FIRST(indices_type)>>()   \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                             \
+                       && (user_op::HobDataType("in", 0) == OF_PP_PAIR_SECOND(in_type))            \
+                       && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                                \
+        const Shape& in_shape = ctx->InputShape("in", 0);                                          \
+        const int64_t emb_size = in_shape.At(0);                                                   \
+        return GetCudaAlignedSize(sizeof(int32_t) * emb_size);                                     \
+      });                                                                                          \
+  REGISTER_USER_KERNEL("embedding")                                                                \
+      .SetCreateFn<                                                                                \
+          GpuEmbeddingKernel<OF_PP_PAIR_FIRST(in_type), OF_PP_PAIR_FIRST(indices_type)>>()         \
+      .SetIsMatchedHob(                                                                            \
+          (user_op::HobDeviceType() == DeviceType::kCUDA)                                          \
+          && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type))                     \
+          && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type)));             \
+  REGISTER_USER_KERNEL("embedding_grad")                                                           \
+      .SetCreateFn<                                                                                \
+          GpuEmbeddingGradKernel<OF_PP_PAIR_FIRST(in_type), OF_PP_PAIR_FIRST(indices_type)>>()     \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                             \
+                       && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(in_type))        \
+                       && (user_op::HobDataType("indices", 0) == OF_PP_PAIR_SECOND(indices_type))) \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                                \
+        const Shape& in_shape = ctx->InputShape("weight", 0);                                      \
+        const int64_t emb_size = in_shape.At(0);                                                   \
+        return GetCudaAlignedSize(sizeof(int32_t) * emb_size);                                     \
+      });
+
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_KERNEL, EMBEDDING_DATA_TYPE_SEQ_CUDA,
+                                 INDEX_DATA_TYPE_SEQ)
+#undef REGISTER_CUDA_EMBEDDING_KERNEL
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/embedding_kernel_util.hip.cpp b/oneflow/user/kernels/embedding_kernel_util.hip.cpp
index 7c0df30..a2f5e7d 100644
--- a/oneflow/user/kernels/embedding_kernel_util.hip.cpp
+++ b/oneflow/user/kernels/embedding_kernel_util.hip.cpp
@@ -1,182 +1,182 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/embedding_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct AccumulateType { - using type = T; -}; - -template<> -struct AccumulateType { - using type = float; -}; - -template -__global__ void embedding_kernel(const T* weight_buf, const IndexType* indices_buf, T* out_buf, - const int64_t num_indices, const int64_t emb_size, - const int64_t emb_dim) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { - IndexType indices_index = i / emb_dim; - IndexType emb_dim_index = i - indices_index * emb_dim; - IndexType emb_size_index = indices_buf[indices_index]; - assert(emb_size_index >= 0 && emb_size_index < emb_size); - IndexType from_index = emb_size_index * emb_dim + emb_dim_index; - out_buf[i] = weight_buf[from_index]; - } -} - -template -__global__ void embedding_grad_kernel(const T* dy_buf, const IndexType* indices_buf, T* dx_buf, - const int64_t padding_idx, const int64_t num_indices, - const int64_t emb_dim) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { - IndexType indices_index = i / emb_dim; - IndexType emb_dim_index = i - indices_index * emb_dim; - IndexType emb_size_index = indices_buf[indices_index]; - if (emb_size_index != padding_idx) { - IndexType from_index = emb_size_index * emb_dim + emb_dim_index; - cuda::atomic::Add(dx_buf + from_index, dy_buf[i]); - } - } -} - -template -__global__ void indices_freq_kernel(const IndexType* indices_buf, const int64_t num_indices, - int32_t* indices_freq, const int64_t emb_size) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_indices) { - IndexType index = indices_buf[i]; - assert(index >= 0 && index < emb_size); - cuda::atomic::Add(indices_freq + index, 1); - } -} - -template -__global__ void emb_scale_kernel(T* dx_buf, const int64_t emb_size, const int64_t emb_dim, - int32_t* indices_freq) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, emb_size * emb_dim) { - IndexType emb_size_index = i / emb_dim; - if (indices_freq[emb_size_index] > 1) { - dx_buf[i] /= static_cast(indices_freq[emb_size_index]); - } - } -} - -template -__global__ void embedding_renorm_kernel(const T* in_buf, T* out_buf, int32_t* indices_freq, - const AccumType max_norm, const AccumType norm_type, - const int64_t emb_size, const int64_t emb_dim) { - int64_t tid = threadIdx.x; - for (int64_t emb_idx = blockIdx.x; emb_idx < emb_size; emb_idx += gridDim.x) { - if (indices_freq[emb_idx] == 0) { continue; } - int64_t base_index = emb_idx * emb_dim; - - AccumType v = 0; - for (int64_t i = tid; i < emb_dim; i += blockDim.x) { - v += pow(abs(static_cast(in_buf[base_index + i])), norm_type); - } - - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ AccumType norm; - v = BlockReduce(temp_storage).Sum(v); - - if (tid == 0) { norm = pow(v, static_cast(1.0 / norm_type)); } - __syncthreads(); - - if (norm > max_norm) { - auto scale = static_cast(max_norm / (norm + 1e-7)); - for (int64_t i = tid; i < emb_dim; i += blockDim.x) { - out_buf[base_index + i] = in_buf[base_index + i] * scale; - } - } - } -} - -} // namespace - -template -struct EmbeddingReNormFunctor final { - void operator()(ep::Stream* stream, const T* in_buf, const IndexType* indices_buf, T* out_buf, - const double max_norm, const double norm_type, const int64_t num_indices, - const int64_t emb_size, const int64_t emb_dim, int32_t* tmp_buf) { - 
indices_freq_kernel<<As()->cuda_stream()>>>( - indices_buf, num_indices, tmp_buf, emb_size); - - using AccumType = typename AccumulateType::type; - embedding_renorm_kernel - <<As()->cuda_stream()>>>( - in_buf, out_buf, tmp_buf, static_cast(max_norm), - static_cast(norm_type), emb_size, emb_dim); - } -}; - -template -struct EmbeddingFunctor final { - void operator()(ep::Stream* stream, const T* weight_buf, const IndexType* indices_buf, T* out_buf, - const int64_t padding_idx, const bool scale_grad_by_freq, - const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim) { - embedding_kernel - <<As()->cuda_stream()>>>(weight_buf, indices_buf, out_buf, - num_indices, emb_size, emb_dim); - } -}; - -template -struct EmbeddingGradFunctor final { - void operator()(ep::Stream* stream, const T* dy_buf, const IndexType* indices_buf, T* dx_buf, - const int64_t padding_idx, const bool scale_grad_by_freq, - const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim, - int32_t* tmp_buf) { - embedding_grad_kernel - <<As()->cuda_stream()>>>(dy_buf, indices_buf, dx_buf, padding_idx, - num_indices, emb_dim); - if (scale_grad_by_freq) { - indices_freq_kernel<<As()->cuda_stream()>>>( - indices_buf, num_indices, tmp_buf, emb_size); - emb_scale_kernel - <<As()->cuda_stream()>>>(dx_buf, emb_size, emb_dim, tmp_buf); - } - } -}; - -#define INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ - template struct EmbeddingReNormFunctor; \ - template struct EmbeddingFunctor; \ - template struct EmbeddingGradFunctor; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL, - EMBEDDING_DATA_TYPE_SEQ_CUDA, INDEX_DATA_TYPE_SEQ); - -#undef INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/embedding_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct AccumulateType { + using type = T; +}; + +template<> +struct AccumulateType { + using type = float; +}; + +template +__global__ void embedding_kernel(const T* weight_buf, const IndexType* indices_buf, T* out_buf, + const int64_t num_indices, const int64_t emb_size, + const int64_t emb_dim) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { + IndexType indices_index = i / emb_dim; + IndexType emb_dim_index = i - indices_index * emb_dim; + IndexType emb_size_index = indices_buf[indices_index]; + assert(emb_size_index >= 0 && emb_size_index < emb_size); + IndexType from_index = emb_size_index * emb_dim + emb_dim_index; + out_buf[i] = weight_buf[from_index]; + } +} + +template +__global__ void embedding_grad_kernel(const T* dy_buf, const IndexType* indices_buf, T* dx_buf, + const int64_t padding_idx, const int64_t num_indices, + const int64_t emb_dim) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_indices * emb_dim) { + IndexType indices_index = i / emb_dim; + IndexType emb_dim_index = i - indices_index * emb_dim; + IndexType emb_size_index = indices_buf[indices_index]; + if (emb_size_index != padding_idx) { + IndexType from_index = emb_size_index * emb_dim + emb_dim_index; + cuda::atomic::Add(dx_buf + from_index, dy_buf[i]); + } + } +} + +template +__global__ void indices_freq_kernel(const IndexType* indices_buf, const int64_t num_indices, + int32_t* indices_freq, const int64_t emb_size) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_indices) { + IndexType index = indices_buf[i]; + assert(index >= 0 && index < emb_size); + cuda::atomic::Add(indices_freq + index, 1); + } +} + +template +__global__ void emb_scale_kernel(T* dx_buf, const int64_t emb_size, const int64_t emb_dim, + int32_t* indices_freq) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, emb_size * emb_dim) { + IndexType emb_size_index = i / emb_dim; + if (indices_freq[emb_size_index] > 1) { + dx_buf[i] /= static_cast(indices_freq[emb_size_index]); + } + } +} + +template +__global__ void embedding_renorm_kernel(const T* in_buf, T* out_buf, int32_t* indices_freq, + const AccumType max_norm, const AccumType norm_type, + const int64_t emb_size, const int64_t emb_dim) { + int64_t tid = threadIdx.x; + for (int64_t emb_idx = blockIdx.x; emb_idx < emb_size; emb_idx += gridDim.x) { + if (indices_freq[emb_idx] == 0) { continue; } + int64_t base_index = emb_idx * emb_dim; + + AccumType v = 0; + for (int64_t i = tid; i < emb_dim; i += blockDim.x) { + v += pow(abs(static_cast(in_buf[base_index + i])), norm_type); + } + + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ AccumType norm; + v = BlockReduce(temp_storage).Sum(v); + + if (tid == 0) { norm = pow(v, static_cast(1.0 / norm_type)); } + __syncthreads(); + + if (norm > max_norm) { + auto scale = static_cast(max_norm / (norm + 1e-7)); + for (int64_t i = tid; i < emb_dim; i += blockDim.x) { + out_buf[base_index + i] = in_buf[base_index + i] * scale; + } + } + } +} + +} // namespace + +template +struct EmbeddingReNormFunctor final { + void operator()(ep::Stream* stream, const T* in_buf, const IndexType* indices_buf, T* out_buf, + const double max_norm, const double norm_type, const int64_t num_indices, + const int64_t emb_size, const int64_t emb_dim, int32_t* tmp_buf) { + 
indices_freq_kernel<<As()->cuda_stream()>>>( + indices_buf, num_indices, tmp_buf, emb_size); + + using AccumType = typename AccumulateType::type; + embedding_renorm_kernel + <<As()->cuda_stream()>>>( + in_buf, out_buf, tmp_buf, static_cast(max_norm), + static_cast(norm_type), emb_size, emb_dim); + } +}; + +template +struct EmbeddingFunctor final { + void operator()(ep::Stream* stream, const T* weight_buf, const IndexType* indices_buf, T* out_buf, + const int64_t padding_idx, const bool scale_grad_by_freq, + const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim) { + embedding_kernel + <<As()->cuda_stream()>>>(weight_buf, indices_buf, out_buf, + num_indices, emb_size, emb_dim); + } +}; + +template +struct EmbeddingGradFunctor final { + void operator()(ep::Stream* stream, const T* dy_buf, const IndexType* indices_buf, T* dx_buf, + const int64_t padding_idx, const bool scale_grad_by_freq, + const int64_t num_indices, const int64_t emb_size, const int64_t emb_dim, + int32_t* tmp_buf) { + embedding_grad_kernel + <<As()->cuda_stream()>>>(dy_buf, indices_buf, dx_buf, padding_idx, + num_indices, emb_dim); + if (scale_grad_by_freq) { + indices_freq_kernel<<As()->cuda_stream()>>>( + indices_buf, num_indices, tmp_buf, emb_size); + emb_scale_kernel + <<As()->cuda_stream()>>>(dx_buf, emb_size, emb_dim, tmp_buf); + } + } +}; + +#define INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ + template struct EmbeddingReNormFunctor; \ + template struct EmbeddingFunctor; \ + template struct EmbeddingGradFunctor; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL, + EMBEDDING_DATA_TYPE_SEQ_CUDA, INDEX_DATA_TYPE_SEQ); + +#undef INITIATE_EMBEDDING_KERNEL_UTIL_CUDA_IMPL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/erfinv_kernel.hip.cpp b/oneflow/user/kernels/erfinv_kernel.hip.cpp index 7b057e6..977a8ed 100644 --- a/oneflow/user/kernels/erfinv_kernel.hip.cpp +++ b/oneflow/user/kernels/erfinv_kernel.hip.cpp @@ -1,61 +1,61 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" - -namespace oneflow { - -template -struct ErfInvFunctor { - OF_DEVICE_FUNC ErfInvFunctor() {} - OF_DEVICE_FUNC T operator()(T x) const { return erfinv(x); } -}; - -template -class GpuErfinvKernel final : public user_op::OpKernel { - public: - GpuErfinvKernel() = default; - ~GpuErfinvKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - OF_CUDA_CHECK(cuda::elementwise::Unary(ErfInvFunctor(), elem_cnt, y->mut_dptr(), - x->dptr(), - ctx->stream()->As()->cuda_stream())); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ERFINV_KERNEL(dtype) \ - REGISTER_USER_KERNEL("erfinv") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "x", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_ERFINV_KERNEL(float) -REGISTER_CUDA_ERFINV_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/hip/elementwise.hip.h"
+
+namespace oneflow {
+
+template<typename T>
+struct ErfInvFunctor {
+  OF_DEVICE_FUNC ErfInvFunctor() {}
+  OF_DEVICE_FUNC T operator()(T x) const { return erfinv(x); }
+};
+
+template<typename T>
+class GpuErfinvKernel final : public user_op::OpKernel {
+ public:
+  GpuErfinvKernel() = default;
+  ~GpuErfinvKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
+    user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0);
+    const int32_t elem_cnt = x->shape_view().elem_cnt();
+    OF_CUDA_CHECK(cuda::elementwise::Unary(ErfInvFunctor<T>(), elem_cnt, y->mut_dptr<T>(),
+                                           x->dptr<T>(),
+                                           ctx->stream()->As<ep::CudaStream>()->cuda_stream()));
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_CUDA_ERFINV_KERNEL(dtype)                                                      \
+  REGISTER_USER_KERNEL("erfinv")                                                                \
+      .SetCreateFn<GpuErfinvKernel<dtype>>()                                                    \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
+                       && (user_op::HobDataType("y", 0) == GetDataType<dtype>::value))          \
+      .SetInplaceProposalFn([](const user_op::InferContext&,                                    \
+                               user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> { \
+        OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "x", 0, true));                          \
+        return Maybe<void>::Ok();                                                               \
+      });
+
+REGISTER_CUDA_ERFINV_KERNEL(float)
+REGISTER_CUDA_ERFINV_KERNEL(double)
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/expand_kernel.hip.cpp b/oneflow/user/kernels/expand_kernel.hip.cpp
index 07699da..5f417e3 100644
--- a/oneflow/user/kernels/expand_kernel.hip.cpp
+++ b/oneflow/user/kernels/expand_kernel.hip.cpp
@@ -1,220 +1,220 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/expand_kernel_utils.h" - -namespace oneflow { - -namespace { - -const int32_t NDIMS = 16; -struct STRIDES { - int32_t val[NDIMS]; -}; - -template -__global__ void ExpandCudaKernel(const T* in_ptr, const STRIDES in_stride, - const STRIDES expand_stride, const int32_t dims, - const int32_t elements, T* out_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = OffsetToNdIndexToOffset(gid, in_stride.val, expand_stride.val, dims); - out_ptr[gid] = in_ptr[offset]; - gid += step; - } -} - -template -__global__ void ExpandGradCudaKernel(const T* out_diff_ptr, const STRIDES out_stride, - const STRIDES expand_stride, const int32_t dims, - const int32_t elements, T* in_diff_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = OffsetToNdIndexToOffset(gid, out_stride.val, expand_stride.val, dims); - cuda::atomic::Add(&in_diff_ptr[offset], out_diff_ptr[gid]); - gid += step; - } -} - -template -__global__ void InitPtr(const int32_t elements, T* ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - ptr[gid] = static_cast(0); - gid += step; - } -} - -template -struct GpuExpandFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, - const STRIDES expand_stride, const int32_t dims, const int32_t elements, - T* out_ptr) { - RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, dims, - elements, out_ptr); - } -}; - -template<> -void GpuExpandFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const STRIDES in_stride, const STRIDES expand_stride, - const int32_t dims, const int32_t elements, - float16* out_ptr) { - RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, reinterpret_cast(in_ptr), - in_stride, expand_stride, dims, elements, reinterpret_cast(out_ptr)); -} - -template -struct GpuExpandGradFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, - const STRIDES expand_stride, const int32_t dims, const int32_t elements, - const int32_t out_elements, T* out_ptr) { - RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, out_ptr); - RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, - dims, elements, out_ptr); - } -}; - -template<> -void GpuExpandGradFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const STRIDES in_stride, const STRIDES expand_stride, - const int32_t dims, const int32_t elements, - const int32_t out_elements, float16* out_ptr) { - RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, - reinterpret_cast(out_ptr)); - RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, - reinterpret_cast(in_ptr), in_stride, expand_stride, dims, elements, - reinterpret_cast(out_ptr)); -} - -} // namespace - -template -class GpuExpandKernel final : public user_op::OpKernel { - public: - GpuExpandKernel() = default; - ~GpuExpandKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 
0); - const std::vector& logical_expand_shape = - ctx->Attr>("logical_expand_shape"); - if (std::any_of(logical_expand_shape.begin(), logical_expand_shape.end(), - [](int32_t dim_size) { return dim_size == 0; })) { - return; - } - std::vector in_shape; - in_shape.resize(in->shape_view().NumAxes()); - for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } - - std::vector out_shape; - std::vector stride; - CHECK_JUST(getOutShapeAndStrideForFp(in_shape, logical_expand_shape, out_shape, stride)); - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - const int32_t out_dims = out->shape_view().NumAxes(); - const int32_t out_size = out->shape_view().elem_cnt(); - - STRIDES expand_stride; - for (int i = 0; i < out_dims; ++i) { expand_stride.val[i] = stride[i]; } - STRIDES out_stride; - InitStride(out_stride.val, out_shape.data(), out_dims); - GpuExpandFunctor()(ctx->stream(), in_ptr, out_stride, expand_stride, out_dims, out_size, - out_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_EXPAND_KERNEL(dtype) \ - REGISTER_USER_KERNEL("expand").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_EXPAND_KERNEL(float); -REGISTER_EXPAND_KERNEL(double); -REGISTER_EXPAND_KERNEL(float16); -REGISTER_EXPAND_KERNEL(bool); -REGISTER_EXPAND_KERNEL(uint8_t); -REGISTER_EXPAND_KERNEL(int8_t); -REGISTER_EXPAND_KERNEL(int32_t); -REGISTER_EXPAND_KERNEL(int64_t); - -template -class GpuExpandGradKernel final : public user_op::OpKernel { - public: - GpuExpandGradKernel() = default; - ~GpuExpandGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const std::vector& logical_out_shape = - ctx->Attr>("logical_out_shape"); - const std::vector& logical_expand_shape = - ctx->Attr>("logical_expand_shape"); - - std::vector in_shape; - in_shape.resize(in->shape_view().NumAxes()); - for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } - std::vector out_shape; - std::vector stride; - CHECK_JUST(getOutShapeAndStrideForBp(logical_out_shape, logical_expand_shape, in_shape, - out_shape, stride)); - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - - const int32_t in_dims = in->shape_view().NumAxes(); - const int32_t in_size = in->shape_view().elem_cnt(); - const int32_t out_size = out->shape_view().elem_cnt(); - - STRIDES expand_stride; - for (int i = 0; i < in_dims; ++i) { expand_stride.val[i] = stride[i]; } - STRIDES in_stride; - InitStride(in_stride.val, in_shape.data(), in_dims); - - GpuExpandGradFunctor()(ctx->stream(), in_ptr, in_stride, expand_stride, in_dims, in_size, - out_size, out_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_EXPAND_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("expand_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_EXPAND_GRAD_KERNEL(float); -REGISTER_EXPAND_GRAD_KERNEL(double); -REGISTER_EXPAND_GRAD_KERNEL(float16); -REGISTER_EXPAND_GRAD_KERNEL(int32_t); -REGISTER_EXPAND_GRAD_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/expand_kernel_utils.h" + +namespace oneflow { + +namespace { + +const int32_t NDIMS = 16; +struct STRIDES { + int32_t val[NDIMS]; +}; + +template +__global__ void ExpandCudaKernel(const T* in_ptr, const STRIDES in_stride, + const STRIDES expand_stride, const int32_t dims, + const int32_t elements, T* out_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = OffsetToNdIndexToOffset(gid, in_stride.val, expand_stride.val, dims); + out_ptr[gid] = in_ptr[offset]; + gid += step; + } +} + +template +__global__ void ExpandGradCudaKernel(const T* out_diff_ptr, const STRIDES out_stride, + const STRIDES expand_stride, const int32_t dims, + const int32_t elements, T* in_diff_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = OffsetToNdIndexToOffset(gid, out_stride.val, expand_stride.val, dims); + cuda::atomic::Add(&in_diff_ptr[offset], out_diff_ptr[gid]); + gid += step; + } +} + +template +__global__ void InitPtr(const int32_t elements, T* ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + ptr[gid] = static_cast(0); + gid += step; + } +} + +template +struct GpuExpandFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, + const STRIDES expand_stride, const int32_t dims, const int32_t elements, + T* out_ptr) { + RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, dims, + elements, out_ptr); + } +}; + +template<> +void GpuExpandFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const STRIDES in_stride, const STRIDES expand_stride, + const int32_t dims, const int32_t elements, + float16* out_ptr) { + RUN_CUDA_KERNEL((ExpandCudaKernel), stream, elements, reinterpret_cast(in_ptr), + in_stride, expand_stride, dims, elements, reinterpret_cast(out_ptr)); +} + +template +struct GpuExpandGradFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES in_stride, + const STRIDES expand_stride, const int32_t dims, const int32_t elements, + const int32_t out_elements, T* out_ptr) { + RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, out_ptr); + RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, in_ptr, in_stride, expand_stride, + dims, elements, out_ptr); + } +}; + +template<> +void GpuExpandGradFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const STRIDES in_stride, const STRIDES expand_stride, + const int32_t dims, const int32_t elements, + const int32_t out_elements, float16* out_ptr) { + RUN_CUDA_KERNEL((InitPtr), stream, out_elements, out_elements, + reinterpret_cast(out_ptr)); + 
RUN_CUDA_KERNEL((ExpandGradCudaKernel), stream, elements, + reinterpret_cast(in_ptr), in_stride, expand_stride, dims, elements, + reinterpret_cast(out_ptr)); +} + +} // namespace + +template +class GpuExpandKernel final : public user_op::OpKernel { + public: + GpuExpandKernel() = default; + ~GpuExpandKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::vector& logical_expand_shape = + ctx->Attr>("logical_expand_shape"); + if (std::any_of(logical_expand_shape.begin(), logical_expand_shape.end(), + [](int32_t dim_size) { return dim_size == 0; })) { + return; + } + std::vector in_shape; + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } + + std::vector out_shape; + std::vector stride; + CHECK_JUST(getOutShapeAndStrideForFp(in_shape, logical_expand_shape, out_shape, stride)); + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + const int32_t out_dims = out->shape_view().NumAxes(); + const int32_t out_size = out->shape_view().elem_cnt(); + + STRIDES expand_stride; + for (int i = 0; i < out_dims; ++i) { expand_stride.val[i] = stride[i]; } + STRIDES out_stride; + InitStride(out_stride.val, out_shape.data(), out_dims); + GpuExpandFunctor()(ctx->stream(), in_ptr, out_stride, expand_stride, out_dims, out_size, + out_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_EXPAND_KERNEL(dtype) \ + REGISTER_USER_KERNEL("expand").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_EXPAND_KERNEL(float); +REGISTER_EXPAND_KERNEL(double); +REGISTER_EXPAND_KERNEL(float16); +REGISTER_EXPAND_KERNEL(bool); +REGISTER_EXPAND_KERNEL(uint8_t); +REGISTER_EXPAND_KERNEL(int8_t); +REGISTER_EXPAND_KERNEL(int32_t); +REGISTER_EXPAND_KERNEL(int64_t); + +template +class GpuExpandGradKernel final : public user_op::OpKernel { + public: + GpuExpandGradKernel() = default; + ~GpuExpandGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::vector& logical_out_shape = + ctx->Attr>("logical_out_shape"); + const std::vector& logical_expand_shape = + ctx->Attr>("logical_expand_shape"); + + std::vector in_shape; + in_shape.resize(in->shape_view().NumAxes()); + for (int i = 0; i < in->shape_view().NumAxes(); ++i) { in_shape[i] = in->shape_view().At(i); } + std::vector out_shape; + std::vector stride; + CHECK_JUST(getOutShapeAndStrideForBp(logical_out_shape, logical_expand_shape, in_shape, + out_shape, stride)); + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + + const int32_t in_dims = in->shape_view().NumAxes(); + const int32_t in_size = in->shape_view().elem_cnt(); + const int32_t out_size = out->shape_view().elem_cnt(); + + STRIDES expand_stride; + for (int i = 0; i < in_dims; ++i) { expand_stride.val[i] = stride[i]; } + STRIDES in_stride; + InitStride(in_stride.val, in_shape.data(), in_dims); + + GpuExpandGradFunctor()(ctx->stream(), in_ptr, in_stride, expand_stride, in_dims, in_size, + out_size, 
out_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_EXPAND_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("expand_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_EXPAND_GRAD_KERNEL(float); +REGISTER_EXPAND_GRAD_KERNEL(double); +REGISTER_EXPAND_GRAD_KERNEL(float16); +REGISTER_EXPAND_GRAD_KERNEL(int32_t); +REGISTER_EXPAND_GRAD_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/eye_kernel_util.hip.cpp b/oneflow/user/kernels/eye_kernel_util.hip.cpp index 84fd109..978c987 100644 --- a/oneflow/user/kernels/eye_kernel_util.hip.cpp +++ b/oneflow/user/kernels/eye_kernel_util.hip.cpp @@ -1,40 +1,40 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/eye_kernel_util.h" - -namespace oneflow { - -namespace user_op { - -template -__global__ void EyeForwardGpuKernel(const int64_t cols, const int64_t rows, T* out) { - SetOneInDiag(cols, rows, out); -} - -template -struct EyeFunctor final { - void operator()(ep::Stream* stream, const int64_t& cols, const int64_t& rows, T* out) { - RUN_CUDA_KERNEL((EyeForwardGpuKernel), stream, rows, cols, rows, out); - } -}; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_EYE_FUNCTOR, (DeviceType::kCUDA), EYE_DATA_TYPE_SEQ); -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/eye_kernel_util.h" + +namespace oneflow { + +namespace user_op { + +template +__global__ void EyeForwardGpuKernel(const int64_t cols, const int64_t rows, T* out) { + SetOneInDiag(cols, rows, out); +} + +template +struct EyeFunctor final { + void operator()(ep::Stream* stream, const int64_t& cols, const int64_t& rows, T* out) { + RUN_CUDA_KERNEL((EyeForwardGpuKernel), stream, rows, cols, rows, out); + } +}; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_EYE_FUNCTOR, (DeviceType::kCUDA), EYE_DATA_TYPE_SEQ); +} // namespace user_op +} // namespace oneflow + #endif // End WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/fake_quantization_kernel.hip.cpp b/oneflow/user/kernels/fake_quantization_kernel.hip.cpp index 126595e..a992cc2 100644 --- a/oneflow/user/kernels/fake_quantization_kernel.hip.cpp +++ b/oneflow/user/kernels/fake_quantization_kernel.hip.cpp @@ -1,160 +1,160 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void FakeQuantizationSymmetric(const T* in_ptr, const T* scale_ptr, - const int64_t scale_size, const int64_t elements, - const int64_t panel_size, const double quantization_bit, - T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out * scale; - - gid += step; - } -} - -template -__global__ void FakeQuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, - const int64_t scale_size, const int64_t elements, - const int64_t panel_size, const double quantization_bit, - T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; - T lower_bound = 0; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - T zero_point = zero_point_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale + zero_point); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? 
lower_bound : out; - out_ptr[gid] = (out - zero_point) * scale; - - gid += step; - } -} - -template -__global__ void FakeQuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, - const int64_t elements, const int64_t panel_size, - const double quantization_bit, T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - T scale = static_cast(pow(2.0, static_cast(shift[0]))); - - while (gid < elements) { - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out * scale; - gid += step; - } -} - -} // namespace - -template -class GpuFakeQuantizationKernel final : public user_op::OpKernel { - public: - GpuFakeQuantizationKernel() = default; - ~GpuFakeQuantizationKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - const int64_t elements = in->shape_view().elem_cnt(); - const int64_t panel_size = in->shape_view().Count(1); - const int64_t scale_size = scale->shape_view().elem_cnt(); - - // round to even - auto origin_round_mode = std::fegetround(); - std::fesetround(FE_TONEAREST); - - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - RUN_CUDA_KERNEL((FakeQuantizationSymmetric), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { // quantization_scheme == "affine" - RUN_CUDA_KERNEL((FakeQuantizationAffine), ctx->stream(), elements, in->dptr(), - scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, - quantization_bit, out->mut_dptr()); - } - } else if (quantization_formula == "cambricon") { - RUN_CUDA_KERNEL((FakeQuantizationCambricon), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { - UNIMPLEMENTED(); - } - - std::fesetround(origin_round_mode); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FAKE_QUANTIZATION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fake_quantization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_FAKE_QUANTIZATION_KERNEL(float); -REGISTER_FAKE_QUANTIZATION_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void FakeQuantizationSymmetric(const T* in_ptr, const T* scale_ptr, + const int64_t scale_size, const int64_t elements, + const int64_t panel_size, const double quantization_bit, + T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = out * scale; + + gid += step; + } +} + +template +__global__ void FakeQuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, + const int64_t scale_size, const int64_t elements, + const int64_t panel_size, const double quantization_bit, + T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; + T lower_bound = 0; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + T zero_point = zero_point_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale + zero_point); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = (out - zero_point) * scale; + + gid += step; + } +} + +template +__global__ void FakeQuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, + const int64_t elements, const int64_t panel_size, + const double quantization_bit, T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + T scale = static_cast(pow(2.0, static_cast(shift[0]))); + + while (gid < elements) { + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? 
lower_bound : out; + out_ptr[gid] = out * scale; + gid += step; + } +} + +} // namespace + +template +class GpuFakeQuantizationKernel final : public user_op::OpKernel { + public: + GpuFakeQuantizationKernel() = default; + ~GpuFakeQuantizationKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t panel_size = in->shape_view().Count(1); + const int64_t scale_size = scale->shape_view().elem_cnt(); + + // round to even + auto origin_round_mode = std::fegetround(); + std::fesetround(FE_TONEAREST); + + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + RUN_CUDA_KERNEL((FakeQuantizationSymmetric), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { // quantization_scheme == "affine" + RUN_CUDA_KERNEL((FakeQuantizationAffine), ctx->stream(), elements, in->dptr(), + scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, + quantization_bit, out->mut_dptr()); + } + } else if (quantization_formula == "cambricon") { + RUN_CUDA_KERNEL((FakeQuantizationCambricon), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + + std::fesetround(origin_round_mode); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FAKE_QUANTIZATION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fake_quantization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_FAKE_QUANTIZATION_KERNEL(float); +REGISTER_FAKE_QUANTIZATION_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fill_kernel.hip.cpp b/oneflow/user/kernels/fill_kernel.hip.cpp index b123325..290575f 100644 --- a/oneflow/user/kernels/fill_kernel.hip.cpp +++ b/oneflow/user/kernels/fill_kernel.hip.cpp @@ -1,61 +1,61 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/kernel/new_kernel_util.h" - -namespace oneflow { - -namespace { -template -__global__ void FillTensorGpuForward(const int n, const T* value, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = value[0]; } -} -}; // namespace - -template -class FillTensorGpuKernel final : public user_op::OpKernel { - public: - FillTensorGpuKernel() = default; - ~FillTensorGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); - const int32_t elem_cnt = in->shape_view().elem_cnt(); - RUN_CUDA_KERNEL((FillTensorGpuForward), ctx->stream(), elem_cnt, elem_cnt, value->dptr(), - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FILL_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fill_tensor_") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_FILL_CUDA_KERNEL(float) -REGISTER_FILL_CUDA_KERNEL(double) -REGISTER_FILL_CUDA_KERNEL(int8_t) -REGISTER_FILL_CUDA_KERNEL(int32_t) -REGISTER_FILL_CUDA_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/kernel/new_kernel_util.h" + +namespace oneflow { + +namespace { +template +__global__ void FillTensorGpuForward(const int n, const T* value, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = value[0]; } +} +}; // namespace + +template +class FillTensorGpuKernel final : public user_op::OpKernel { + public: + FillTensorGpuKernel() = default; + ~FillTensorGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0); + const int32_t elem_cnt = in->shape_view().elem_cnt(); + RUN_CUDA_KERNEL((FillTensorGpuForward), ctx->stream(), elem_cnt, elem_cnt, value->dptr(), + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FILL_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fill_tensor_") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_FILL_CUDA_KERNEL(float) +REGISTER_FILL_CUDA_KERNEL(double) +REGISTER_FILL_CUDA_KERNEL(int8_t) +REGISTER_FILL_CUDA_KERNEL(int32_t) +REGISTER_FILL_CUDA_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/flip_kernel.hip.cpp b/oneflow/user/kernels/flip_kernel.hip.cpp index 2c191a1..35b9a9b 100644 --- a/oneflow/user/kernels/flip_kernel.hip.cpp +++ b/oneflow/user/kernels/flip_kernel.hip.cpp @@ -1,104 +1,104 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/ep/include/stream.h" - -namespace oneflow { - -namespace { - -const int32_t NDIMS = 16; -struct SIZE_V { - int32_t val[NDIMS]; -}; - -struct VIS { - bool val[NDIMS] = {false}; -}; - -template -__global__ void FlipGpuForward(const int32_t element, const int64_t total_dims, - const SIZE_V sizes_v, const VIS vis, SIZE_V strides_v, - const T* in_dptr, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(i, element) { - int32_t cur_indices = i; - int32_t rem = 0; - int32_t dst_offset = 0; - for (int32_t d = 0; d < total_dims; d++) { - int32_t temp = cur_indices; - cur_indices = cur_indices / strides_v.val[d]; - rem = temp - cur_indices * strides_v.val[d]; - dst_offset += vis.val[d] ? 
(sizes_v.val[d] - 1 - cur_indices) * strides_v.val[d] - : cur_indices * strides_v.val[d]; - cur_indices = rem; - } - out_dptr[i] = in_dptr[dst_offset]; - } -} - -} // namespace - -template -class FlipGpuKernel final : public user_op::OpKernel { - public: - FlipGpuKernel() = default; - ~FlipGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = y_tensor->shape_view().elem_cnt(); - if (elem_cnt == 0) { return; } - const int32_t total_dims = y_tensor->shape_view().NumAxes(); - - std::vector dims = ctx->Attr>("dims"); - VIS vis; - for (auto x : dims) { vis.val[x] = true; } - - SIZE_V sizes_v; - for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape_view().At(i); } - - // TODO(bbuf) delete strides caluculate, after tensor strides supported - SIZE_V strides_v; - strides_v.val[total_dims - 1] = 1; - for (int32_t i = total_dims - 2; i >= 0; i--) { - strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape_view().At(i + 1); - } - RUN_CUDA_KERNEL((FlipGpuForward), ctx->stream(), elem_cnt, elem_cnt, total_dims, sizes_v, - vis, strides_v, x_tensor->dptr(), y_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FLIP_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("flip").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_FLIP_CUDA_KERNEL(bool) -REGISTER_FLIP_CUDA_KERNEL(float) -REGISTER_FLIP_CUDA_KERNEL(double) -REGISTER_FLIP_CUDA_KERNEL(uint8_t) -REGISTER_FLIP_CUDA_KERNEL(int8_t) -REGISTER_FLIP_CUDA_KERNEL(int32_t) -REGISTER_FLIP_CUDA_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/ep/include/stream.h" + +namespace oneflow { + +namespace { + +const int32_t NDIMS = 16; +struct SIZE_V { + int32_t val[NDIMS]; +}; + +struct VIS { + bool val[NDIMS] = {false}; +}; + +template +__global__ void FlipGpuForward(const int32_t element, const int64_t total_dims, + const SIZE_V sizes_v, const VIS vis, SIZE_V strides_v, + const T* in_dptr, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(i, element) { + int32_t cur_indices = i; + int32_t rem = 0; + int32_t dst_offset = 0; + for (int32_t d = 0; d < total_dims; d++) { + int32_t temp = cur_indices; + cur_indices = cur_indices / strides_v.val[d]; + rem = temp - cur_indices * strides_v.val[d]; + dst_offset += vis.val[d] ? 
(sizes_v.val[d] - 1 - cur_indices) * strides_v.val[d] + : cur_indices * strides_v.val[d]; + cur_indices = rem; + } + out_dptr[i] = in_dptr[dst_offset]; + } +} + +} // namespace + +template +class FlipGpuKernel final : public user_op::OpKernel { + public: + FlipGpuKernel() = default; + ~FlipGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t elem_cnt = y_tensor->shape_view().elem_cnt(); + if (elem_cnt == 0) { return; } + const int32_t total_dims = y_tensor->shape_view().NumAxes(); + + std::vector dims = ctx->Attr>("dims"); + VIS vis; + for (auto x : dims) { vis.val[x] = true; } + + SIZE_V sizes_v; + for (int32_t i = 0; i < total_dims; i++) { sizes_v.val[i] = y_tensor->shape_view().At(i); } + + // TODO(bbuf) delete strides caluculate, after tensor strides supported + SIZE_V strides_v; + strides_v.val[total_dims - 1] = 1; + for (int32_t i = total_dims - 2; i >= 0; i--) { + strides_v.val[i] = strides_v.val[i + 1] * y_tensor->shape_view().At(i + 1); + } + RUN_CUDA_KERNEL((FlipGpuForward), ctx->stream(), elem_cnt, elem_cnt, total_dims, sizes_v, + vis, strides_v, x_tensor->dptr(), y_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FLIP_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("flip").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_FLIP_CUDA_KERNEL(bool) +REGISTER_FLIP_CUDA_KERNEL(float) +REGISTER_FLIP_CUDA_KERNEL(double) +REGISTER_FLIP_CUDA_KERNEL(uint8_t) +REGISTER_FLIP_CUDA_KERNEL(int8_t) +REGISTER_FLIP_CUDA_KERNEL(int32_t) +REGISTER_FLIP_CUDA_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fold_kernel_util.hip.cpp b/oneflow/user/kernels/fold_kernel_util.hip.cpp index 7085abc..ea82e73 100644 --- a/oneflow/user/kernels/fold_kernel_util.hip.cpp +++ b/oneflow/user/kernels/fold_kernel_util.hip.cpp @@ -1,75 +1,75 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
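For reference, a minimal host-side sketch of the index transform FlipGpuForward performs: each destination offset is decomposed into per-dimension coordinates via contiguous row-major strides (built the same way as in FlipGpuKernel::Compute), and every flipped dimension has its coordinate mirrored. The helper name is hypothetical and this sketch is not part of the patch.

#include <cstdint>
#include <vector>

int64_t FlippedSrcOffset(int64_t dst_offset, const std::vector<int64_t>& sizes,
                         const std::vector<bool>& flipped) {
  const int64_t ndims = static_cast<int64_t>(sizes.size());
  // Contiguous (row-major) strides, matching the strides_v computed in the kernel wrapper.
  std::vector<int64_t> strides(ndims, 1);
  for (int64_t d = ndims - 2; d >= 0; --d) { strides[d] = strides[d + 1] * sizes[d + 1]; }
  int64_t remaining = dst_offset;
  int64_t src_offset = 0;
  for (int64_t d = 0; d < ndims; ++d) {
    const int64_t coord = remaining / strides[d];
    remaining -= coord * strides[d];
    // A flipped dimension maps coordinate c to (size - 1 - c); other dimensions pass through.
    const int64_t src_coord = flipped[d] ? sizes[d] - 1 - coord : coord;
    src_offset += src_coord * strides[d];
  }
  return src_offset;
}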
-*/ -#ifdef WITH_ROCM - -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/fold_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace user_op { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -// NDIM range: (1, 2, 3) -// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first -template -__global__ void CudaFoldForward(FoldParams params, const T* input_ptr, - T* output_ptr) { - CUDA_1D_KERNEL_LOOP_T(INDEX_T, in_offset, params.in_elem_cnt) { - using ParamType = FoldParams; - INDEX_T in_index[ParamType::kInputNDim] = {0}; - INDEX_T out_index[ParamType::kOutputNDim] = {0}; - params.in_index_helper.OffsetToNdIndex(in_offset, in_index); - if (!FoldIndexTransform(params, in_index, out_index)) { - INDEX_T out_offset = params.out_index_helper.NdIndexToOffset(out_index); - XPUAdd::Invoke(&input_ptr[in_offset], &output_ptr[out_offset]); - } else { - continue; - } - } -} - -} // namespace - -template -struct FoldKernelUtil { - using ParamType = FoldParams; - static void Forward(ep::Stream* stream, const void* raw_params, const T* input_ptr, - T* output_ptr) { - const auto* fold_params = static_cast(raw_params); - CudaFoldForward - <<in_elem_cnt), kBlockSize, 0, - stream->As()->cuda_stream()>>>(*fold_params, input_ptr, output_ptr); - } -}; - -INSTANTIATE_FOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) - -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM + +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/fold_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace user_op { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +// NDIM range: (1, 2, 3) +// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first +template +__global__ void CudaFoldForward(FoldParams params, const T* input_ptr, + T* output_ptr) { + CUDA_1D_KERNEL_LOOP_T(INDEX_T, in_offset, params.in_elem_cnt) { + using ParamType = FoldParams; + INDEX_T in_index[ParamType::kInputNDim] = {0}; + INDEX_T out_index[ParamType::kOutputNDim] = {0}; + params.in_index_helper.OffsetToNdIndex(in_offset, in_index); + if (!FoldIndexTransform(params, in_index, out_index)) { + INDEX_T out_offset = params.out_index_helper.NdIndexToOffset(out_index); + XPUAdd::Invoke(&input_ptr[in_offset], &output_ptr[out_offset]); + } else { + continue; + } + } +} + +} // namespace + +template +struct FoldKernelUtil { + using ParamType = FoldParams; + static void Forward(ep::Stream* stream, const void* raw_params, const T* input_ptr, + T* output_ptr) { + const auto* fold_params = static_cast(raw_params); + CudaFoldForward + <<in_elem_cnt), kBlockSize, 0, + stream->As()->cuda_stream()>>>(*fold_params, input_ptr, output_ptr); + } +}; + +INSTANTIATE_FOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) + +} // namespace user_op +} // namespace oneflow #endif \ No newline at end of file diff --git a/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp b/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp index 079f394..ceabaa0 100644 --- a/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_bias_add_kernel.hip.cpp @@ -1,456 +1,456 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
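The fold kernel above iterates with CUDA_1D_KERNEL_LOOP_T, which (as assumed here) expands to the usual grid-stride loop so that any element count is covered regardless of the launched grid size. A minimal standalone HIP sketch of that pattern with a hypothetical kernel, not part of the patch:

#include "hip/hip_runtime.h"

template<typename IndexType>
__global__ void GridStrideIota(IndexType n, IndexType* out) {
  // Each thread starts at its global id and advances by the total number of
  // launched threads, so all n elements are visited exactly once.
  for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<IndexType>(gridDim.x) * blockDim.x) {
    out[i] = i;
  }
}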
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct GeluFunctor { - __device__ T Compute(T x, int64_t i) const { - return static_cast(0.5) * x * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x)); - } -}; - -template<> -struct GeluFunctor { - GeluFunctor float_functor; - __device__ half Compute(half x, int64_t i) const { - return __float2half(float_functor.Compute(__half2float(x), i)); - } - __device__ half2 ComputeHalf2(half2 x, int64_t i) const { - half2 y; - y.data.x = __float2half(float_functor.Compute(__half2float(x.data.x), 2 * i)); - y.data.y = __float2half(float_functor.Compute(__half2float(x.data.y), 2 * i + 1)); - return y; - } -}; - -template -struct MaskAndScaleFunctor { - MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} - __device__ T Compute(T x, int64_t i) const { return x * static_cast(mask[i]) * scale; } - const bool* mask; - float scale; -}; - -template<> -struct MaskAndScaleFunctor { - MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} - __device__ half Compute(half x, int64_t i) const { - return x * static_cast(mask[i] * scale); - } - __device__ half2 ComputeHalf2(half2 x, int64_t i) const { - const char2* mask_c2 = reinterpret_cast(mask); - char2 mask_val = mask_c2[i]; - half2 one_or_zero_h2; - half2 h2_scale = __float2half2_rn(scale); - one_or_zero_h2.data.x = mask_val.x; - one_or_zero_h2.data.y = mask_val.y; - return __hmul2(__hmul2(x, one_or_zero_h2), h2_scale); - } - const bool* mask; - float scale; -}; - -template -struct MaskAndScaleAddFunctor { - MaskAndScaleAddFunctor(const bool* mask, const T* addend, float scale) - : mask(mask), addend(addend), scale(scale) {} - __device__ T Compute(T x, int64_t i) const { - return x * static_cast(mask[i]) * scale + addend[i]; - } - const bool* mask; - const T* addend; - float scale; -}; - -template<> -struct MaskAndScaleAddFunctor { - MaskAndScaleAddFunctor(const bool* mask, const half* addend, float scale) - : mask(mask), addend(addend), scale(scale) {} - __device__ half Compute(half x, int64_t i) const { - return x * static_cast(mask[i] * scale) + addend[i]; - } - __device__ half2 ComputeHalf2(half2 x, int64_t i) const { - const char2* mask_c2 = reinterpret_cast(mask); - const half2* addend_h2 = reinterpret_cast(addend); - char2 mask_val = mask_c2[i]; - half2 one_or_zero_h2; - half2 h2_scale = __float2half2_rn(scale); - one_or_zero_h2.data.x = mask_val.x; - one_or_zero_h2.data.y = mask_val.y; - return __hadd2(__hmul2(__hmul2(x, one_or_zero_h2), h2_scale), addend_h2[i]); - } - const bool* mask; - const half* addend; - float scale; -}; - -template -struct GeluGradFunctor { - const T coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); - __device__ T Compute(T x, T dy, int64_t i) const { - return static_cast(0.5) - * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) - + x * coef * exp(static_cast(-0.5) * x * x)) - * dy; - } -}; - -template<> -struct GeluGradFunctor { - GeluGradFunctor float_functor; - __device__ half Compute(half x, half dy, int64_t i) const { - return __float2half(float_functor.Compute(__half2float(x), __half2float(dy), i)); - } -}; - -template -__global__ void FusedBiasAddGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, - const Index inner_size, const T* x, const T* bias, T* y) { - const Index block_size = bias_size * inner_size; - 
CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[(i % block_size) / inner_size]; - y[i] = functor.Compute(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradGpu(FUNCTOR grad_functor, const Index elem_cnt, - const Index bias_size, const Index inner_size, const T* x, - const T* bias, const T* dy, T* dx) { - const Index block_size = bias_size * inner_size; - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[(i % block_size) / inner_size]; - dx[i] = grad_functor.Compute(x_i, dy[i], i); - } -} - -template -__global__ void FusedBiasAddRowGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, - const T* x, const T* bias, T* y) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i % bias_size]; - y[i] = functor.Compute(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradRowGpu(FUNCTOR grad_functor, const Index elem_cnt, - const Index bias_size, const T* x, const T* bias, - const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i % bias_size]; - dx[i] = grad_functor.Compute(x_i, dy[i], i); - } -} - -template -__global__ void FusedBiasAddRowGpuHalf2(FUNCTOR functor, const Index elem_cnt, - const Index bias_size, const half* x, const half* bias, - half* y) { - const Index h2_elem_cnt = elem_cnt / 2; - const Index h2_bias_size = bias_size / 2; - const auto* x_h2 = reinterpret_cast(x); - const auto* bias_h2 = reinterpret_cast(bias); - auto* y_h2 = reinterpret_cast(y); - CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { - half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); - y_h2[i] = functor.ComputeHalf2(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradRowGpuHalf2(FUNCTOR grad_functor, const Index elem_cnt, - const Index bias_size, const half* x, const half* bias, - const half* dy, half* dx) { - const Index h2_elem_cnt = elem_cnt / 2; - const Index h2_bias_size = bias_size / 2; - const auto* x_h2 = reinterpret_cast(x); - const auto* bias_h2 = reinterpret_cast(bias); - const auto* dy_h2 = reinterpret_cast(dy); - auto* dx_h2 = reinterpret_cast(dx); - CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { - half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); - half2 dy_i = dy_h2[i]; - half2 dx_i; - dx_i.data.x = grad_functor.Compute(x_i.data.x, dy_i.data.x, 2 * i); - dx_i.data.y = grad_functor.Compute(x_i.data.y, dy_i.data.y, 2 * i + 1); - dx_h2[i] = dx_i; - } -} - -template -__global__ void FusedBiasAddColGpu(FUNCTOR functor, const Index elem_cnt, const Index inner_size, - const T* x, const T* bias, T* y) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i / inner_size]; - y[i] = functor.Compute(x_i, i); - } -} - -template -__global__ void FusedBiasAddGradColGpu(FUNCTOR grad_functor, const Index elem_cnt, - const Index inner_size, const T* x, const T* bias, - const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { - T x_i = x[i] + bias[i / inner_size]; - dx[i] = grad_functor.Compute(x_i, dy[i], i); - } -} - -template -struct FusedBiasAddRow { - static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, - const T* x, const T* bias, T* y) { - FusedBiasAddRowGpu - <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, y); - } -}; - -template -struct FusedBiasAddRow { - static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, - const half* x, const half* bias, half* y) { - if (bias_size % 2 == 0) { - FusedBiasAddRowGpuHalf2 - <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, 
bias, - y); - } else { - FusedBiasAddRowGpu - <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, - y); - } - } -}; - -template -void FusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, Index outer_size, Index bias_size, - Index inner_size, const T* x, const T* bias, T* y) { - const Index elem_cnt = outer_size * bias_size * inner_size; - if (inner_size == 1) { - FusedBiasAddRow::Invoke(stream, functor, elem_cnt, bias_size, x, bias, y); - } else if (outer_size == 1) { - FusedBiasAddColGpu<<As()->cuda_stream()>>>( - functor, elem_cnt, inner_size, x, bias, y); - } else { - FusedBiasAddGpu<<As()->cuda_stream()>>>( - functor, elem_cnt, bias_size, inner_size, x, bias, y); - } -} - -template -struct FusedBiasAddGradRow { - static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, - const T* x, const T* bias, const T* dy, T* dx) { - FusedBiasAddGradRowGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } -}; - -template -struct FusedBiasAddGradRow { - static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, - const half* x, const half* bias, const half* dy, half* dx) { - if (bias_size % 2 == 0) { - FusedBiasAddGradRowGpuHalf2 - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } else { - FusedBiasAddGradRowGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } - } -}; - -template -void FusedBiasAddGradImpl(ep::Stream* stream, FUNCTOR grad_functor, Index outer_size, - Index bias_size, Index inner_size, const T* x, const T* bias, const T* dy, - T* dx) { - const Index elem_cnt = outer_size * bias_size * inner_size; - if (inner_size == 1) { - FusedBiasAddGradRow::Invoke(stream, grad_functor, elem_cnt, bias_size, x, - bias, dy, dx); - } else if (outer_size == 1) { - FusedBiasAddGradColGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, inner_size, x, - bias, dy, dx); - } else { - FusedBiasAddGradGpu - <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, - inner_size, x, bias, dy, dx); - } -} - -template -void DispatchFusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, int64_t n, - int64_t outer_size, int64_t bias_size, int64_t inner_size, - const T* x, const T* bias, T* y) { - if (IsKernelSafeInt32(n)) { - FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, - x, bias, y); - } else { - FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, - x, bias, y); - } -} - -} // namespace - -template -class FusedFusedBiasAddKernel final : public user_op::OpKernel { - public: - FusedFusedBiasAddKernel() = default; - ~FusedFusedBiasAddKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); - const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t bias_add_axis = ctx->Attr("axis"); - const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); - const auto n = a_tensor->shape_view().elem_cnt(); - GeluFunctor gelu_functor{}; - DispatchFusedBiasAddForwardImpl( - ctx->stream(), gelu_functor, n, outer_size, bias_size, inner_size, a_tensor->dptr(), - b_tensor->dptr(), 
out_tensor->mut_dptr()); - }; - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_bias_add_gelu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(float) -REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(double) -REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(half) - -template -class FusedBiasAddMaskScaleKernel final : public user_op::OpKernel { - public: - FusedBiasAddMaskScaleKernel() = default; - ~FusedBiasAddMaskScaleKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); - const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - const auto* mask_tensor = ctx->Tensor4ArgNameAndIndex("mask", 0); - auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t bias_add_axis = ctx->Attr("axis"); - const float scale = ctx->Attr("scale"); - const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); - const auto n = a_tensor->shape_view().elem_cnt(); - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - MaskAndScaleAddFunctor mask_and_scale_add_functor(mask_tensor->dptr(), - addend->dptr(), scale); - DispatchFusedBiasAddForwardImpl( - ctx->stream(), mask_and_scale_add_functor, n, outer_size, bias_size, inner_size, - a_tensor->dptr(), b_tensor->dptr(), out_tensor->mut_dptr()); - } else { - MaskAndScaleFunctor mask_and_scale_functor(mask_tensor->dptr(), scale); - DispatchFusedBiasAddForwardImpl( - ctx->stream(), mask_and_scale_functor, n, outer_size, bias_size, inner_size, - a_tensor->dptr(), b_tensor->dptr(), out_tensor->mut_dptr()); - } - }; - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_bias_add_mask_scale") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(float) -REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(double) -REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(half) - -template -class FusedFusedBiasAddGradKernel final : public user_op::OpKernel { - public: - FusedFusedBiasAddGradKernel() = default; - ~FusedFusedBiasAddGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); - const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); - const auto* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - auto* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int32_t bias_add_axis = ctx->Attr("axis"); - const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); - const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); - const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); - const auto n = a_tensor->shape_view().elem_cnt(); - GeluGradFunctor gelu_grad_functor; - if (IsKernelSafeInt32(n)) { - 
FusedBiasAddGradImpl( - ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), - b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); - } else { - FusedBiasAddGradImpl( - ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), - b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); - } - }; - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_bias_add_gelu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(float) -REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(double) -REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(half) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct GeluFunctor { + __device__ T Compute(T x, int64_t i) const { + return static_cast(0.5) * x * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x)); + } +}; + +template<> +struct GeluFunctor { + GeluFunctor float_functor; + __device__ half Compute(half x, int64_t i) const { + return __float2half(float_functor.Compute(__half2float(x), i)); + } + __device__ half2 ComputeHalf2(half2 x, int64_t i) const { + half2 y; + y.data.x = __float2half(float_functor.Compute(__half2float(x.data.x), 2 * i)); + y.data.y = __float2half(float_functor.Compute(__half2float(x.data.y), 2 * i + 1)); + return y; + } +}; + +template +struct MaskAndScaleFunctor { + MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} + __device__ T Compute(T x, int64_t i) const { return x * static_cast(mask[i]) * scale; } + const bool* mask; + float scale; +}; + +template<> +struct MaskAndScaleFunctor { + MaskAndScaleFunctor(const bool* mask, float scale) : mask(mask), scale(scale) {} + __device__ half Compute(half x, int64_t i) const { + return x * static_cast(mask[i] * scale); + } + __device__ half2 ComputeHalf2(half2 x, int64_t i) const { + const char2* mask_c2 = reinterpret_cast(mask); + char2 mask_val = mask_c2[i]; + half2 one_or_zero_h2; + half2 h2_scale = __float2half2_rn(scale); + one_or_zero_h2.data.x = mask_val.x; + one_or_zero_h2.data.y = mask_val.y; + return __hmul2(__hmul2(x, one_or_zero_h2), h2_scale); + } + const bool* mask; + float scale; +}; + +template +struct MaskAndScaleAddFunctor { + MaskAndScaleAddFunctor(const bool* mask, const T* addend, float scale) + : mask(mask), addend(addend), scale(scale) {} + __device__ T Compute(T x, int64_t i) const { + return x * static_cast(mask[i]) * scale + addend[i]; + } + const bool* mask; + const T* addend; + float scale; +}; + +template<> +struct MaskAndScaleAddFunctor { + 
MaskAndScaleAddFunctor(const bool* mask, const half* addend, float scale) + : mask(mask), addend(addend), scale(scale) {} + __device__ half Compute(half x, int64_t i) const { + return x * static_cast(mask[i] * scale) + addend[i]; + } + __device__ half2 ComputeHalf2(half2 x, int64_t i) const { + const char2* mask_c2 = reinterpret_cast(mask); + const half2* addend_h2 = reinterpret_cast(addend); + char2 mask_val = mask_c2[i]; + half2 one_or_zero_h2; + half2 h2_scale = __float2half2_rn(scale); + one_or_zero_h2.data.x = mask_val.x; + one_or_zero_h2.data.y = mask_val.y; + return __hadd2(__hmul2(__hmul2(x, one_or_zero_h2), h2_scale), addend_h2[i]); + } + const bool* mask; + const half* addend; + float scale; +}; + +template +struct GeluGradFunctor { + const T coef = std::sqrt(static_cast(2.0) / std::acos(static_cast(-1.0))); + __device__ T Compute(T x, T dy, int64_t i) const { + return static_cast(0.5) + * (static_cast(1.0) + erf(static_cast(M_SQRT1_2) * x) + + x * coef * exp(static_cast(-0.5) * x * x)) + * dy; + } +}; + +template<> +struct GeluGradFunctor { + GeluGradFunctor float_functor; + __device__ half Compute(half x, half dy, int64_t i) const { + return __float2half(float_functor.Compute(__half2float(x), __half2float(dy), i)); + } +}; + +template +__global__ void FusedBiasAddGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, + const Index inner_size, const T* x, const T* bias, T* y) { + const Index block_size = bias_size * inner_size; + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[(i % block_size) / inner_size]; + y[i] = functor.Compute(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradGpu(FUNCTOR grad_functor, const Index elem_cnt, + const Index bias_size, const Index inner_size, const T* x, + const T* bias, const T* dy, T* dx) { + const Index block_size = bias_size * inner_size; + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[(i % block_size) / inner_size]; + dx[i] = grad_functor.Compute(x_i, dy[i], i); + } +} + +template +__global__ void FusedBiasAddRowGpu(FUNCTOR functor, const Index elem_cnt, const Index bias_size, + const T* x, const T* bias, T* y) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i % bias_size]; + y[i] = functor.Compute(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradRowGpu(FUNCTOR grad_functor, const Index elem_cnt, + const Index bias_size, const T* x, const T* bias, + const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i % bias_size]; + dx[i] = grad_functor.Compute(x_i, dy[i], i); + } +} + +template +__global__ void FusedBiasAddRowGpuHalf2(FUNCTOR functor, const Index elem_cnt, + const Index bias_size, const half* x, const half* bias, + half* y) { + const Index h2_elem_cnt = elem_cnt / 2; + const Index h2_bias_size = bias_size / 2; + const auto* x_h2 = reinterpret_cast(x); + const auto* bias_h2 = reinterpret_cast(bias); + auto* y_h2 = reinterpret_cast(y); + CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { + half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); + y_h2[i] = functor.ComputeHalf2(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradRowGpuHalf2(FUNCTOR grad_functor, const Index elem_cnt, + const Index bias_size, const half* x, const half* bias, + const half* dy, half* dx) { + const Index h2_elem_cnt = elem_cnt / 2; + const Index h2_bias_size = bias_size / 2; + const auto* x_h2 = reinterpret_cast(x); + const auto* bias_h2 = reinterpret_cast(bias); + const auto* dy_h2 = reinterpret_cast(dy); + auto* dx_h2 = 
reinterpret_cast(dx); + CUDA_1D_KERNEL_LOOP_T(Index, i, h2_elem_cnt) { + half2 x_i = __hadd2(x_h2[i], bias_h2[i % h2_bias_size]); + half2 dy_i = dy_h2[i]; + half2 dx_i; + dx_i.data.x = grad_functor.Compute(x_i.data.x, dy_i.data.x, 2 * i); + dx_i.data.y = grad_functor.Compute(x_i.data.y, dy_i.data.y, 2 * i + 1); + dx_h2[i] = dx_i; + } +} + +template +__global__ void FusedBiasAddColGpu(FUNCTOR functor, const Index elem_cnt, const Index inner_size, + const T* x, const T* bias, T* y) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i / inner_size]; + y[i] = functor.Compute(x_i, i); + } +} + +template +__global__ void FusedBiasAddGradColGpu(FUNCTOR grad_functor, const Index elem_cnt, + const Index inner_size, const T* x, const T* bias, + const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(Index, i, elem_cnt) { + T x_i = x[i] + bias[i / inner_size]; + dx[i] = grad_functor.Compute(x_i, dy[i], i); + } +} + +template +struct FusedBiasAddRow { + static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, + const T* x, const T* bias, T* y) { + FusedBiasAddRowGpu + <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, y); + } +}; + +template +struct FusedBiasAddRow { + static void Invoke(ep::Stream* stream, FUNCTOR functor, Index elem_cnt, Index bias_size, + const half* x, const half* bias, half* y) { + if (bias_size % 2 == 0) { + FusedBiasAddRowGpuHalf2 + <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, + y); + } else { + FusedBiasAddRowGpu + <<As()->cuda_stream()>>>(functor, elem_cnt, bias_size, x, bias, + y); + } + } +}; + +template +void FusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, Index outer_size, Index bias_size, + Index inner_size, const T* x, const T* bias, T* y) { + const Index elem_cnt = outer_size * bias_size * inner_size; + if (inner_size == 1) { + FusedBiasAddRow::Invoke(stream, functor, elem_cnt, bias_size, x, bias, y); + } else if (outer_size == 1) { + FusedBiasAddColGpu<<As()->cuda_stream()>>>( + functor, elem_cnt, inner_size, x, bias, y); + } else { + FusedBiasAddGpu<<As()->cuda_stream()>>>( + functor, elem_cnt, bias_size, inner_size, x, bias, y); + } +} + +template +struct FusedBiasAddGradRow { + static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, + const T* x, const T* bias, const T* dy, T* dx) { + FusedBiasAddGradRowGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } +}; + +template +struct FusedBiasAddGradRow { + static void Invoke(ep::Stream* stream, FUNCTOR grad_functor, Index elem_cnt, Index bias_size, + const half* x, const half* bias, const half* dy, half* dx) { + if (bias_size % 2 == 0) { + FusedBiasAddGradRowGpuHalf2 + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } else { + FusedBiasAddGradRowGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } + } +}; + +template +void FusedBiasAddGradImpl(ep::Stream* stream, FUNCTOR grad_functor, Index outer_size, + Index bias_size, Index inner_size, const T* x, const T* bias, const T* dy, + T* dx) { + const Index elem_cnt = outer_size * bias_size * inner_size; + if (inner_size == 1) { + FusedBiasAddGradRow::Invoke(stream, grad_functor, elem_cnt, bias_size, x, + bias, dy, dx); + } else if (outer_size == 1) { + FusedBiasAddGradColGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, inner_size, x, + bias, dy, dx); + } else { + FusedBiasAddGradGpu + <<As()->cuda_stream()>>>(grad_functor, elem_cnt, 
bias_size, + inner_size, x, bias, dy, dx); + } +} + +template +void DispatchFusedBiasAddForwardImpl(ep::Stream* stream, FUNCTOR functor, int64_t n, + int64_t outer_size, int64_t bias_size, int64_t inner_size, + const T* x, const T* bias, T* y) { + if (IsKernelSafeInt32(n)) { + FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, + x, bias, y); + } else { + FusedBiasAddForwardImpl(stream, functor, outer_size, bias_size, inner_size, + x, bias, y); + } +} + +} // namespace + +template +class FusedFusedBiasAddKernel final : public user_op::OpKernel { + public: + FusedFusedBiasAddKernel() = default; + ~FusedFusedBiasAddKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); + const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); + auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t bias_add_axis = ctx->Attr("axis"); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); + GeluFunctor gelu_functor{}; + DispatchFusedBiasAddForwardImpl( + ctx->stream(), gelu_functor, n, outer_size, bias_size, inner_size, a_tensor->dptr(), + b_tensor->dptr(), out_tensor->mut_dptr()); + }; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_bias_add_gelu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(float) +REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(double) +REGISTER_FUSED_BIAS_ADD_GELU_KERNEL(half) + +template +class FusedBiasAddMaskScaleKernel final : public user_op::OpKernel { + public: + FusedBiasAddMaskScaleKernel() = default; + ~FusedBiasAddMaskScaleKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); + const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); + const auto* mask_tensor = ctx->Tensor4ArgNameAndIndex("mask", 0); + auto* out_tensor = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t bias_add_axis = ctx->Attr("axis"); + const float scale = ctx->Attr("scale"); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* addend = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + MaskAndScaleAddFunctor mask_and_scale_add_functor(mask_tensor->dptr(), + addend->dptr(), scale); + DispatchFusedBiasAddForwardImpl( + ctx->stream(), mask_and_scale_add_functor, n, outer_size, bias_size, inner_size, + a_tensor->dptr(), b_tensor->dptr(), out_tensor->mut_dptr()); + } else { + MaskAndScaleFunctor mask_and_scale_functor(mask_tensor->dptr(), scale); + DispatchFusedBiasAddForwardImpl( + ctx->stream(), mask_and_scale_functor, n, outer_size, bias_size, inner_size, + a_tensor->dptr(), b_tensor->dptr(), 
out_tensor->mut_dptr()); + } + }; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_bias_add_mask_scale") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(float) +REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(double) +REGISTER_FUSED_BIAS_ADD_MASK_SCALE_KERNEL(half) + +template +class FusedFusedBiasAddGradKernel final : public user_op::OpKernel { + public: + FusedFusedBiasAddGradKernel() = default; + ~FusedFusedBiasAddGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* a_tensor = ctx->Tensor4ArgNameAndIndex("a", 0); + const auto* b_tensor = ctx->Tensor4ArgNameAndIndex("b", 0); + const auto* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + auto* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const int32_t bias_add_axis = ctx->Attr("axis"); + const int64_t outer_size = a_tensor->shape_view().Count(0, bias_add_axis); + const int64_t bias_size = a_tensor->shape_view().At(bias_add_axis); + const int64_t inner_size = a_tensor->shape_view().Count(bias_add_axis + 1); + const auto n = a_tensor->shape_view().elem_cnt(); + GeluGradFunctor gelu_grad_functor; + if (IsKernelSafeInt32(n)) { + FusedBiasAddGradImpl( + ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), + b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); + } else { + FusedBiasAddGradImpl( + ctx->stream(), gelu_grad_functor, outer_size, bias_size, inner_size, a_tensor->dptr(), + b_tensor->dptr(), dy_tensor->dptr(), dx_tensor->mut_dptr()); + } + }; + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_bias_add_gelu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(float) +REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(double) +REGISTER_FUSED_BIAS_ADD_GELU_GRAD_KERNEL(half) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp b/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp index 5e898fd..b61f559 100644 --- a/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_cast_scale_kernel.hip.cpp @@ -1,112 +1,112 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
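As a plain host-side reference for the fused_bias_add_gelu forward above (the inner_size == 1 row layout handled by GeluFunctor and FusedBiasAddRowGpu): the bias is broadcast along the last axis and the GELU 0.5 * v * (1 + erf(v / sqrt(2))) is applied to the sum. A minimal sketch with a hypothetical helper name, assuming x.size() is a multiple of bias.size(); not part of the patch.

#include <cmath>
#include <vector>

std::vector<float> FusedBiasAddGeluRef(const std::vector<float>& x,
                                       const std::vector<float>& bias) {
  std::vector<float> y(x.size());
  const size_t bias_size = bias.size();
  for (size_t i = 0; i < x.size(); ++i) {
    const float v = x[i] + bias[i % bias_size];                // bias add over the last axis
    y[i] = 0.5f * v * (1.0f + std::erf(v / std::sqrt(2.0f)));  // GELU, as in GeluFunctor
  }
  return y;
}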
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void FusedCastScaleGpu(const int64_t n, const T scale_val, const U* in, - const T* scale_by_ptr, T* out) { - const T scale = *scale_by_ptr * scale_val; - CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(in[i]) * scale; } -} - -template<> -__global__ void FusedCastScaleGpu(const int64_t n, const float scale_val, - const half* in, const float* scale_by_ptr, - float* out) { - const float scale = *scale_by_ptr * scale_val; - const int64_t n_2 = n / 2; - const auto* in_2 = reinterpret_cast(in); - auto* out_2 = reinterpret_cast(out); - CUDA_1D_KERNEL_LOOP(i, n_2) { - float2 f2 = __half22float2(in_2[i]); - f2.x *= scale; - f2.y *= scale; - out_2[i] = f2; - } - if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { - out[n - 1] = __half2float(in[n - 1]) * scale; - } -} - -template<> -__global__ void FusedCastScaleGpu(const int64_t n, const half scale_val, - const float* in, const half* scale_by_ptr, - half* out) { - const half scale = *scale_by_ptr * scale_val; - const half2 scale_h2 = __half2half2(scale); - const int64_t n_2 = n / 2; - const auto* in_2 = reinterpret_cast(in); - auto* out_h2 = reinterpret_cast(out); - CUDA_1D_KERNEL_LOOP(i, n_2) { - half2 in_h2 = __float22half2_rn(in_2[i]); - out_h2[i] = __hmul2(in_h2, scale_h2); - } - if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { - out[n - 1] = __float2half(in[n - 1]) * scale; - } -} - -template -class FusedCastScaleGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - FusedCastScaleGpuKernel() = default; - ~FusedCastScaleGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int64_t n = x->shape_view().elem_cnt(); - const double scale = ctx->Attr("scale"); - const int64_t launch_n = ((std::is_same::value && std::is_same::value) - || (std::is_same::value && std::is_same::value)) - ? RoundUp(n, 2) / 2 - : n; - FusedCastScaleGpu<<stream()->As()->cuda_stream()>>>( - n, static_cast(scale), x->dptr(), scale_by_tensor->dptr(), y->mut_dptr()); - }; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(x_type, y_type) \ - REGISTER_USER_KERNEL("fused_cast_scale") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, float); -// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, double); -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, half); -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, double); -// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, half); -REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, float); -#undef REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void FusedCastScaleGpu(const int64_t n, const T scale_val, const U* in, + const T* scale_by_ptr, T* out) { + const T scale = *scale_by_ptr * scale_val; + CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(in[i]) * scale; } +} + +template<> +__global__ void FusedCastScaleGpu(const int64_t n, const float scale_val, + const half* in, const float* scale_by_ptr, + float* out) { + const float scale = *scale_by_ptr * scale_val; + const int64_t n_2 = n / 2; + const auto* in_2 = reinterpret_cast(in); + auto* out_2 = reinterpret_cast(out); + CUDA_1D_KERNEL_LOOP(i, n_2) { + float2 f2 = __half22float2(in_2[i]); + f2.x *= scale; + f2.y *= scale; + out_2[i] = f2; + } + if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { + out[n - 1] = __half2float(in[n - 1]) * scale; + } +} + +template<> +__global__ void FusedCastScaleGpu(const int64_t n, const half scale_val, + const float* in, const half* scale_by_ptr, + half* out) { + const half scale = *scale_by_ptr * scale_val; + const half2 scale_h2 = __half2half2(scale); + const int64_t n_2 = n / 2; + const auto* in_2 = reinterpret_cast(in); + auto* out_h2 = reinterpret_cast(out); + CUDA_1D_KERNEL_LOOP(i, n_2) { + half2 in_h2 = __float22half2_rn(in_2[i]); + out_h2[i] = __hmul2(in_h2, scale_h2); + } + if (n % 2 == 1 && blockIdx.x == 0 && threadIdx.x == 0) { + out[n - 1] = __float2half(in[n - 1]) * scale; + } +} + +template +class FusedCastScaleGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + FusedCastScaleGpuKernel() = default; + ~FusedCastScaleGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int64_t n = x->shape_view().elem_cnt(); + const double scale = ctx->Attr("scale"); + const int64_t launch_n = ((std::is_same::value && std::is_same::value) + || (std::is_same::value && std::is_same::value)) + ? 
RoundUp(n, 2) / 2 + : n; + FusedCastScaleGpu<<stream()->As()->cuda_stream()>>>( + n, static_cast(scale), x->dptr(), scale_by_tensor->dptr(), y->mut_dptr()); + }; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(x_type, y_type) \ + REGISTER_USER_KERNEL("fused_cast_scale") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)); + +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, float); +// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(half, double); +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, half); +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(float, double); +// REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, half); +REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL(double, float); +#undef REGISTER_FUSED_CAST_SCALE_CUDA_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp b/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp index a2dd5bb..d83dad4 100644 --- a/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp +++ b/oneflow/user/kernels/fused_cross_feature_interaction.hip.cpp @@ -1,259 +1,259 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -enum InteractionMode { kVector = 0, kMatrix }; - -constexpr int kBlockSize = 256; - -void InferMatmulMNK(const ShapeView& a_shape, const ShapeView& b_shape, bool transpose_a, - bool transpose_b, size_t* m, size_t* n, size_t* k) { - const int64_t num_a_axes = a_shape.NumAxes(); - CHECK_GE(num_a_axes, 2); - const int64_t num_b_axes = b_shape.NumAxes(); - CHECK_GE(num_b_axes, 2); - if (!transpose_a) { - *m = a_shape.At(num_a_axes - 2); - *k = a_shape.At(num_a_axes - 1); - } else { - *m = a_shape.At(num_a_axes - 1); - *k = a_shape.At(num_a_axes - 2); - } - if (!transpose_b) { - CHECK_EQ(b_shape.At(num_b_axes - 2), *k); - *n = b_shape.At(num_b_axes - 1); - } else { - CHECK_EQ(b_shape.At(num_b_axes - 1), *k); - *n = b_shape.At(num_b_axes - 2); - } -} - -ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { - return transpose ? 
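For reference, the fused_cast_scale kernels above all compute out[i] = cast(in[i]) * (*scale_by_tensor * scale), with the half/float specializations only vectorizing that same arithmetic. A minimal host-side sketch with a hypothetical helper name, not part of the patch:

#include <vector>

template<typename T, typename U>
std::vector<T> FusedCastScaleRef(const std::vector<U>& in, T scale_by_tensor, T scale) {
  const T effective_scale = scale_by_tensor * scale;  // corresponds to *scale_by_ptr * scale_val
  std::vector<T> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) { out[i] = static_cast<T>(in[i]) * effective_scale; }
  return out;
}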
ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; -} - -std::unique_ptr NewMatmulPrimitive(DeviceType device_type, - DataType data_type, bool transpose_a, - bool transpose_b) { - const auto trans_a = GetBlasTransposeType(transpose_a); - const auto trans_b = GetBlasTransposeType(transpose_b); - return ep::primitive::NewPrimitive(device_type, data_type, trans_a, - trans_b); -} - -template -std::unique_ptr NewMatmulPrimitive(Context* ctx) { - const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); - return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, - /*transpose_b=*/true); -} - -auto MatmulPrimitiveExists() { - return hob::make_custom("MatmulPrimitiveExists", [](const user_op::KernelRegContext& ctx) { - return NewMatmulPrimitive(&ctx).operator bool(); - }); -} - -template -__global__ void FusedBiasAddMulAddResidualKernel(const T* in, const T* x, const T* x0, - const T* bias, T* out, const IndexType cols, - const IndexType elem_cnt) { - const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; - using LoadPack = cuda::elementwise::Packed; - for (IndexType linear_index = global_thread_id * pack_size, - step = gridDim.x * blockDim.x * pack_size; - linear_index < elem_cnt; linear_index += step) { - const IndexType row_idx = linear_index / cols; - const IndexType col_idx = linear_index - row_idx * cols; - - const LoadPack* x0_load = reinterpret_cast(x0 + linear_index); - const LoadPack* x_load = reinterpret_cast(x + linear_index); - const LoadPack* bias_load = reinterpret_cast(bias + col_idx); - - LoadPack x0_vec = *x0_load; - LoadPack x_vec = *x_load; - LoadPack bias_vec = *bias_load; - - LoadPack out_store; - if (mode == InteractionMode::kVector) { - T in_val = in[row_idx]; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - out_store.elem[i] = x0_vec.elem[i] * in_val + bias_vec.elem[i] + x_vec.elem[i]; - } - } else if (mode == InteractionMode::kMatrix) { - const LoadPack* in_load = reinterpret_cast(in + linear_index); - LoadPack in_vec = *in_load; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - out_store.elem[i] = (in_vec.elem[i] + bias_vec.elem[i]) * x0_vec.elem[i] + x_vec.elem[i]; - } - } else { - asm volatile("s_trap 0;"); - } - *(reinterpret_cast(out + linear_index)) = out_store; - } -} - -template -int GetLaunchPackSize(const int64_t cols) { - constexpr int type_pack_size = cuda::elementwise::PackSize(); - for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { - if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { - return launch_pack_size; - } - } - return 1; -} - -template -void DispatchFusedBiasAddMulAddResidualPackSize(ep::Stream* stream, const T* in, const T* x, - const T* x0, const T* bias, T* out, - const IndexType cols, const IndexType elem_cnt) { - int grid_size; - const int pack_size = GetLaunchPackSize(cols); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, elem_cnt); - } else if (pack_size == 4) { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, elem_cnt); - } else if (pack_size == 2) { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, elem_cnt); - } else { - FusedBiasAddMulAddResidualKernel - <<As()->cuda_stream()>>>( - in, x, x0, bias, out, cols, 
elem_cnt); - } -} - -template -void DispatchFusedBiasAddMulAddResidualIndexType(ep::Stream* stream, const T* in, const T* x, - const T* x0, const T* bias, T* out, - const int64_t cols, const int64_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, - elem_cnt); - } else { - DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, - elem_cnt); - } -} - -template -class FusedCrossFeatureInteractionKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedCrossFeatureInteractionKernel() = default; - ~FusedCrossFeatureInteractionKernel() override = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - /* - Cross Interaction v1: - 1. x matmul weight. matmul_result0 -> (B, E) matmul (1, E) -> (B, 1) - dx = dmatmul_result0 matmul weight - dw = x matmul dmatmul_result0 - - 2. matmul_result0 broadcast_mul x0. matmul_result1 -> (B, 1) broadcast_mul (B, E) -> (B, E) - dmatmul_result0 = reduce_sum(dmatmul_result1 * x0, axis=1) - dx0 = dmatmul_result1 broadcast_mul matmul_result0 - - 3. matmul_result1 broadcast_add bias. matmul_result2 -> (B, E) broadcast_add (1, E) -> (B, E) - dmatmul_result1 = dout - dbias = reduce_sum(dmatmul_result2, axis=0) - - 4. matmul_result2 add x. out -> (B, E) elementwise_add (B, E) -> (B, E) - dmatmul_result2 = dout, dx = dout. - - Cross Interaction Grad: - dw = x matmul dmatmul_result0 - dx0 = dmatmul_result1 broadcast_mul matmul_result0 - dbias = reduce_sum(dmatmul_result2, axis=0) - dx = (dmatmul_result0 matmul weight) + dout. - - Cross Interaction v2: - 1. x matmul weight. matmul_result0 -> (B, E) matmul (E, E) -> (B, E) - - 2. matmul_result0 add bias. matmul_result1 -> (B, E) bias_add (1, E) -> (B, E) - - 3. matmul_result1 multiply x0. matmul_result2 -> (B, E) elementwise_mul (B, E) -> (B, E) - - 4. matmul_result2 add x. 
out -> (B, E) elementwise_add (B, E) -> (B, E) - - */ - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); - const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - const std::string interaction_mode = ctx->Attr("interaction_mode"); - - CHECK_EQ(out->shape_view().NumAxes(), 2); - size_t m = 0, n = 0, k = 0; - InferMatmulMNK(x->shape_view(), weight->shape_view(), /*trans_a=*/false, /*trans_b=*/true, &m, - &n, &k); - const double alpha = 1.0; - double beta = 0.0; - auto matmul = NewMatmulPrimitive(ctx); - CHECK(matmul); - matmul->Launch(ctx->stream(), m, n, k, alpha, x->dptr(), weight->dptr(), beta, - matmul_result->mut_dptr()); - const int64_t elem_cnt = out->shape_view().elem_cnt(); - const int64_t cols = out->shape_view().At(1); - if (interaction_mode == "vector") { - DispatchFusedBiasAddMulAddResidualIndexType( - ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), - out->mut_dptr(), cols, elem_cnt); - } else { - DispatchFusedBiasAddMulAddResidualIndexType( - ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), - out->mut_dptr(), cols, elem_cnt); - } - } -}; - -#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_cross_feature_interaction") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value) \ - && MatmulPrimitiveExists()); - -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(float) -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(half) - -} // namespace - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
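The comment block above describes the "vector" interaction as out = x0 * m + bias + x, where m is the (B, 1) matmul result broadcast across each row; this is exactly the kVector branch of FusedBiasAddMulAddResidualKernel. A minimal host-side sketch of that epilogue, assuming row-major (B, E) tensors and a hypothetical function name; not part of the patch.

#include <vector>

void CrossInteractionV1Epilogue(const std::vector<float>& m,     // (B): per-row matmul result
                                const std::vector<float>& x0,    // (B, E), row-major
                                const std::vector<float>& bias,  // (E)
                                const std::vector<float>& x,     // (B, E), row-major
                                std::vector<float>* out) {       // (B, E), row-major
  const size_t cols = bias.size();
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const size_t row = i / cols;
    const size_t col = i - row * cols;
    (*out)[i] = x0[i] * m[row] + bias[col] + x[i];  // kVector branch of the fused kernel
  }
}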
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +enum InteractionMode { kVector = 0, kMatrix }; + +constexpr int kBlockSize = 256; + +void InferMatmulMNK(const ShapeView& a_shape, const ShapeView& b_shape, bool transpose_a, + bool transpose_b, size_t* m, size_t* n, size_t* k) { + const int64_t num_a_axes = a_shape.NumAxes(); + CHECK_GE(num_a_axes, 2); + const int64_t num_b_axes = b_shape.NumAxes(); + CHECK_GE(num_b_axes, 2); + if (!transpose_a) { + *m = a_shape.At(num_a_axes - 2); + *k = a_shape.At(num_a_axes - 1); + } else { + *m = a_shape.At(num_a_axes - 1); + *k = a_shape.At(num_a_axes - 2); + } + if (!transpose_b) { + CHECK_EQ(b_shape.At(num_b_axes - 2), *k); + *n = b_shape.At(num_b_axes - 1); + } else { + CHECK_EQ(b_shape.At(num_b_axes - 1), *k); + *n = b_shape.At(num_b_axes - 2); + } +} + +ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { + return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; +} + +std::unique_ptr NewMatmulPrimitive(DeviceType device_type, + DataType data_type, bool transpose_a, + bool transpose_b) { + const auto trans_a = GetBlasTransposeType(transpose_a); + const auto trans_b = GetBlasTransposeType(transpose_b); + return ep::primitive::NewPrimitive(device_type, data_type, trans_a, + trans_b); +} + +template +std::unique_ptr NewMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, + /*transpose_b=*/true); +} + +auto MatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const user_op::KernelRegContext& ctx) { + return NewMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +__global__ void FusedBiasAddMulAddResidualKernel(const T* in, const T* x, const T* x0, + const T* bias, T* out, const IndexType cols, + const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const IndexType col_idx = linear_index - row_idx * cols; + + const LoadPack* x0_load = reinterpret_cast(x0 + linear_index); + const LoadPack* x_load = reinterpret_cast(x + linear_index); + const LoadPack* bias_load = reinterpret_cast(bias + col_idx); + + LoadPack x0_vec = *x0_load; + LoadPack x_vec = *x_load; + LoadPack bias_vec = *bias_load; + + LoadPack out_store; + if (mode == InteractionMode::kVector) { + T in_val = in[row_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = x0_vec.elem[i] * in_val + bias_vec.elem[i] + x_vec.elem[i]; + } + } else if (mode == InteractionMode::kMatrix) { + const LoadPack* in_load = reinterpret_cast(in + linear_index); + LoadPack in_vec = *in_load; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = (in_vec.elem[i] + bias_vec.elem[i]) * x0_vec.elem[i] + x_vec.elem[i]; + } + } else { + asm volatile("s_trap 0;"); + } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +int GetLaunchPackSize(const 
int64_t cols) { + constexpr int type_pack_size = cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +void DispatchFusedBiasAddMulAddResidualPackSize(ep::Stream* stream, const T* in, const T* x, + const T* x0, const T* bias, T* out, + const IndexType cols, const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else if (pack_size == 4) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else if (pack_size == 2) { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } else { + FusedBiasAddMulAddResidualKernel + <<As()->cuda_stream()>>>( + in, x, x0, bias, out, cols, elem_cnt); + } +} + +template +void DispatchFusedBiasAddMulAddResidualIndexType(ep::Stream* stream, const T* in, const T* x, + const T* x0, const T* bias, T* out, + const int64_t cols, const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, + elem_cnt); + } else { + DispatchFusedBiasAddMulAddResidualPackSize(stream, in, x, x0, bias, out, cols, + elem_cnt); + } +} + +template +class FusedCrossFeatureInteractionKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedCrossFeatureInteractionKernel() = default; + ~FusedCrossFeatureInteractionKernel() override = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + /* + Cross Interaction v1: + 1. x matmul weight. matmul_result0 -> (B, E) matmul (1, E) -> (B, 1) + dx = dmatmul_result0 matmul weight + dw = x matmul dmatmul_result0 + + 2. matmul_result0 broadcast_mul x0. matmul_result1 -> (B, 1) broadcast_mul (B, E) -> (B, E) + dmatmul_result0 = reduce_sum(dmatmul_result1 * x0, axis=1) + dx0 = dmatmul_result1 broadcast_mul matmul_result0 + + 3. matmul_result1 broadcast_add bias. matmul_result2 -> (B, E) broadcast_add (1, E) -> (B, E) + dmatmul_result1 = dout + dbias = reduce_sum(dmatmul_result2, axis=0) + + 4. matmul_result2 add x. out -> (B, E) elementwise_add (B, E) -> (B, E) + dmatmul_result2 = dout, dx = dout. + + Cross Interaction Grad: + dw = x matmul dmatmul_result0 + dx0 = dmatmul_result1 broadcast_mul matmul_result0 + dbias = reduce_sum(dmatmul_result2, axis=0) + dx = (dmatmul_result0 matmul weight) + dout. + + Cross Interaction v2: + 1. x matmul weight. matmul_result0 -> (B, E) matmul (E, E) -> (B, E) + + 2. matmul_result0 add bias. matmul_result1 -> (B, E) bias_add (1, E) -> (B, E) + + 3. matmul_result1 multiply x0. matmul_result2 -> (B, E) elementwise_mul (B, E) -> (B, E) + + 4. matmul_result2 add x. 
out -> (B, E) elementwise_add (B, E) -> (B, E) + + */ + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const user_op::Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const user_op::Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + const std::string interaction_mode = ctx->Attr("interaction_mode"); + + CHECK_EQ(out->shape_view().NumAxes(), 2); + size_t m = 0, n = 0, k = 0; + InferMatmulMNK(x->shape_view(), weight->shape_view(), /*trans_a=*/false, /*trans_b=*/true, &m, + &n, &k); + const double alpha = 1.0; + double beta = 0.0; + auto matmul = NewMatmulPrimitive(ctx); + CHECK(matmul); + matmul->Launch(ctx->stream(), m, n, k, alpha, x->dptr(), weight->dptr(), beta, + matmul_result->mut_dptr()); + const int64_t elem_cnt = out->shape_view().elem_cnt(); + const int64_t cols = out->shape_view().At(1); + if (interaction_mode == "vector") { + DispatchFusedBiasAddMulAddResidualIndexType( + ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), + out->mut_dptr(), cols, elem_cnt); + } else { + DispatchFusedBiasAddMulAddResidualIndexType( + ctx->stream(), matmul_result->mut_dptr(), x->dptr(), x0->dptr(), bias->dptr(), + out->mut_dptr(), cols, elem_cnt); + } + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && MatmulPrimitiveExists()); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_KERNEL(half) + +} // namespace + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp b/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp index 6e5483a..38e37b0 100644 --- a/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp +++ b/oneflow/user/kernels/fused_cross_feature_interaction_grad.hip.cpp @@ -1,455 +1,455 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = 256; - -void InferMatmulMNK(const DimVector& a_shape, const DimVector& b_shape, bool transpose_a, - bool transpose_b, size_t* m, size_t* n, size_t* k) { - const int64_t num_a_axes = a_shape.size(); - CHECK_GE(num_a_axes, 2); - const int64_t num_b_axes = b_shape.size(); - CHECK_GE(num_b_axes, 2); - if (!transpose_a) { - *m = a_shape.at(num_a_axes - 2); - *k = a_shape.at(num_a_axes - 1); - } else { - *m = a_shape.at(num_a_axes - 1); - *k = a_shape.at(num_a_axes - 2); - } - if (!transpose_b) { - CHECK_EQ(b_shape.at(num_b_axes - 2), *k); - *n = b_shape.at(num_b_axes - 1); - } else { - CHECK_EQ(b_shape.at(num_b_axes - 1), *k); - *n = b_shape.at(num_b_axes - 2); - } -} - -ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { - return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; -} - -template -struct MulOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a * b; } -}; - -template -struct AddOp { - __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } -}; - -template -int GetLaunchPackSize(const int64_t cols) { - constexpr int type_pack_size = cuda::elementwise::PackSize(); - for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { - if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { - return launch_pack_size; - } - } - return 1; -} - -template -__global__ void BroadcastMulKernel(const T* x, const T* y, T* out, const IndexType cols, - const IndexType elem_cnt) { - const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; - using LoadPack = cuda::elementwise::Packed; - for (IndexType linear_index = global_thread_id * pack_size, - step = gridDim.x * blockDim.x * pack_size; - linear_index < elem_cnt; linear_index += step) { - const IndexType row_idx = linear_index / cols; - const LoadPack* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec = *x_load; - LoadPack out_store; - const T y_val = y[row_idx]; -#pragma unroll - for (int i = 0; i < pack_size; i++) { out_store.elem[i] = x_vec.elem[i] * y_val; } - *(reinterpret_cast(out + linear_index)) = out_store; - } -} - -template -void DispatchBroadcastMulPackSize(ep::Stream* stream, const T* x, const T* y, T* out, - const IndexType cols, const IndexType elem_cnt) { - int grid_size; - const int pack_size = GetLaunchPackSize(cols); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } else if (pack_size == 4) { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } else if (pack_size == 2) { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } else { - BroadcastMulKernel - <<As()->cuda_stream()>>>(x, y, out, cols, - elem_cnt); - } -} - -template -void DispatchBroadcastMulIndexType(ep::Stream* stream, const T* x, const T* y, T* out, - const int64_t cols, const int64_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); - } else { - 
DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); - } -} - -template -__global__ void BroadcastAddElementwiseMulKernel(const T* x, const T* y, const T* z, T* out, - const IndexType cols, const IndexType elem_cnt) { - const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; - using LoadPack = cuda::elementwise::Packed; - for (IndexType linear_index = global_thread_id * pack_size, - step = gridDim.x * blockDim.x * pack_size; - linear_index < elem_cnt; linear_index += step) { - const IndexType row_idx = linear_index / cols; - const IndexType col_idx = linear_index - row_idx * cols; - const LoadPack* x_load = reinterpret_cast(x + linear_index); - const LoadPack* y_load = reinterpret_cast(y + col_idx); - const LoadPack* z_load = reinterpret_cast(z + linear_index); - - LoadPack x_vec = *x_load; - LoadPack y_vec = *y_load; - LoadPack z_vec = *z_load; - LoadPack out_store; - -#pragma unroll - for (int i = 0; i < pack_size; i++) { - out_store.elem[i] = (x_vec.elem[i] + y_vec.elem[i]) * z_vec.elem[i]; - } - *(reinterpret_cast(out + linear_index)) = out_store; - } -} - -template -void DispatchBroadcastAddElementwiseMulPackSize(ep::Stream* stream, const T* x, const T* y, - const T* z, T* out, const IndexType cols, - const IndexType elem_cnt) { - int grid_size; - const int pack_size = GetLaunchPackSize(cols); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } else if (pack_size == 4) { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } else if (pack_size == 2) { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } else { - BroadcastAddElementwiseMulKernel - <<As()->cuda_stream()>>>(x, y, z, out, - cols, elem_cnt); - } -} - -template -void DispatchBroadcastAddElementwiseMulIndexType(ep::Stream* stream, const T* x, const T* y, - const T* z, T* out, const int64_t cols, - const int64_t elem_cnt) { - if (elem_cnt < GetMaxVal()) { - DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); - } else { - DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); - } -} - -} // namespace - -namespace user_op { - -std::unique_ptr NewMatmulPrimitive(DeviceType device_type, - DataType data_type, bool transpose_a, - bool transpose_b) { - const auto trans_a = GetBlasTransposeType(transpose_a); - const auto trans_b = GetBlasTransposeType(transpose_b); - return ep::primitive::NewPrimitive(device_type, data_type, trans_a, - trans_b); -} - -template -std::unique_ptr NewReduceMatmulPrimitive(Context* ctx) { - const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->data_type(); - return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, - /*transpose_b=*/false); -} - -auto ReduceMatmulPrimitiveExists() { - return hob::make_custom("MatmulPrimitiveExists", [](const KernelRegContext& ctx) { - return NewReduceMatmulPrimitive(&ctx).operator bool(); - }); -} - -template -std::unique_ptr NewWeightGradMatmulPrimitive(Context* ctx) { - const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); - return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/true, - /*transpose_b=*/false); -} - -auto WeightGradMatmulPrimitiveExists() { - return hob::make_custom("MatmulPrimitiveExists", [](const 
KernelRegContext& ctx) { - return NewWeightGradMatmulPrimitive(&ctx).operator bool(); - }); -} - -template -class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public CudaGraphSupport { - public: - FusedCrossFeatureInteractionGradKernel() = default; - ~FusedCrossFeatureInteractionGradKernel() override = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { - const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); - const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - - const int64_t batch_size = dy->shape_view().At(0); - const int64_t hidden_size = dy->shape_view().At(1); - const int64_t out_size = weight->shape_view().At(0); - const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); - - Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); - Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); - Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); - Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - // step1: Get dbias. - const T* ones = nullptr; - auto* cuda_device = dynamic_cast(ctx->stream()->device()); - if (cuda_device != nullptr) { - ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); - } - size_t m = 0, n = 0, k = 0; - DimVector dy_shape(2); - dy->shape_view().ToDimVector(&dy_shape); - DimVector ones_buf_shape(2); - ones_buf_shape.at(0) = 1; - ones_buf_shape.at(1) = batch_size; - InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); - auto reduce_matmul = NewReduceMatmulPrimitive(ctx); - CHECK(reduce_matmul); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, dy->dptr(), 0.0, dbias->mut_dptr()); - - // step2: Get dmatmul_result0. 
- T* dy_mul_x0 = reinterpret_cast(tmp_buffer->mut_dptr()); - T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); - OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dy_mul_x0, dy->dptr(), - x0->dptr(), - ctx->stream()->As()->cuda_stream())); - - ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), hidden_size)); - DimVector dy_mul_x0_shape(2); - dy->shape_view().ToDimVector(&dy_mul_x0_shape); - ones_buf_shape.at(0) = hidden_size; - ones_buf_shape.at(1) = 1; - InferMatmulMNK(dy_mul_x0_shape, ones_buf_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, - &k); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dy_mul_x0, ones, 0.0, dmatmul_result0); - - // step3: Get dx - T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(dy_elem_cnt * sizeof(T)) - + GetCudaAlignedSize(batch_size * sizeof(T))); - DimVector dmatmul_result_shape(2); - dmatmul_result_shape.at(0) = batch_size; - dmatmul_result_shape.at(1) = 1; // todo change to hidden size - DimVector weight_shape(2); - weight->shape_view().ToDimVector(&weight_shape); - InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, - &k); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, - reinterpret_cast(dx_buf)); - OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, - dy->dptr(), - ctx->stream()->As()->cuda_stream())); - - // step4: Get dw. - DimVector x_shape(2); - x->shape_view().ToDimVector(&x_shape); - - InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); - auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); - CHECK(weight_grad_matmul); - weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, - dw->mut_dptr()); - - // step5: Get dx0. 
- DispatchBroadcastMulIndexType(ctx->stream(), dy->dptr(), matmul_result->dptr(), - dx0->mut_dptr(), hidden_size, dy_elem_cnt); - } -}; - -#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_cross_feature_interaction_v1_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ - && (HobDataType("dy", 0) == GetDataType::value) \ - && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ - .SetInferTmpSizeFn([](InferContext* ctx) { \ - size_t tmp_size = 0; \ - const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ - const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ - const int64_t batch_size = dy.shape().At(0); \ - size_t dy_mul_x0_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ - size_t dmatmul_result_size = GetCudaAlignedSize(batch_size * sizeof(dtype)); \ - size_t dx_buf_size = dy_mul_x0_size; \ - tmp_size = dy_mul_x0_size + dmatmul_result_size + dx_buf_size; \ - return tmp_size; \ - }); - -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(float) -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(half) - -template -class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public CudaGraphSupport { - public: - FusedCrossFeatureInteractionV2GradKernel() = default; - ~FusedCrossFeatureInteractionV2GradKernel() = default; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - - private: - using user_op::OpKernel::Compute; - void Compute(KernelComputeContext* ctx) const override { - const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); - const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); - const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); - - const int64_t batch_size = dy->shape_view().At(0); - const int64_t in_size = weight->shape_view().At(1); - const int64_t hidden_size = weight->shape_view().At(0); - const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); - - Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); - Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); - Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); - Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - // step1: Get dx0. - DispatchBroadcastAddElementwiseMulIndexType(ctx->stream(), matmul_result->dptr(), - bias->dptr(), dy->dptr(), - dx0->mut_dptr(), hidden_size, dy_elem_cnt); - - // step2: Get dmatmul_result0. 
- T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr()); - OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dmatmul_result0, dy->dptr(), - x0->dptr(), - ctx->stream()->As()->cuda_stream())); - // step3: Get dx - T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); - DimVector dmatmul_result_shape(2); - dmatmul_result_shape.at(0) = batch_size; - dmatmul_result_shape.at(1) = hidden_size; - DimVector weight_shape(2); - weight->shape_view().ToDimVector(&weight_shape); - size_t m = 0, n = 0, k = 0; - InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, - &k); - auto reduce_matmul = NewReduceMatmulPrimitive(ctx); - CHECK(reduce_matmul); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, - reinterpret_cast(dx_buf)); - OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, - dy->dptr(), - ctx->stream()->As()->cuda_stream())); - - // step4: Get dw. - DimVector x_shape(2); - x->shape_view().ToDimVector(&x_shape); - - InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); - auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); - CHECK(weight_grad_matmul); - weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, - dw->mut_dptr()); - - // step5: Get dbias. - const T* ones = nullptr; - auto* cuda_device = dynamic_cast(ctx->stream()->device()); - if (cuda_device != nullptr) { - ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); - } - DimVector dy_shape(2); - dy->shape_view().ToDimVector(&dy_shape); - DimVector ones_buf_shape(2); - ones_buf_shape.at(0) = 1; - ones_buf_shape.at(1) = batch_size; - InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); - reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, - reinterpret_cast(dmatmul_result0), 0.0, dbias->mut_dptr()); - } -}; - -#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_cross_feature_interaction_v2_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ - && (HobDataType("dy", 0) == GetDataType::value) \ - && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ - .SetInferTmpSizeFn([](InferContext* ctx) { \ - size_t tmp_size = 0; \ - const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ - const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ - size_t dmatmul_result_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ - size_t dx_buf_size = dmatmul_result_size; \ - tmp_size = dmatmul_result_size + dx_buf_size; \ - return tmp_size; \ - }); - -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(float) -REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(half) - -} // namespace user_op - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = 256; + +void InferMatmulMNK(const DimVector& a_shape, const DimVector& b_shape, bool transpose_a, + bool transpose_b, size_t* m, size_t* n, size_t* k) { + const int64_t num_a_axes = a_shape.size(); + CHECK_GE(num_a_axes, 2); + const int64_t num_b_axes = b_shape.size(); + CHECK_GE(num_b_axes, 2); + if (!transpose_a) { + *m = a_shape.at(num_a_axes - 2); + *k = a_shape.at(num_a_axes - 1); + } else { + *m = a_shape.at(num_a_axes - 1); + *k = a_shape.at(num_a_axes - 2); + } + if (!transpose_b) { + CHECK_EQ(b_shape.at(num_b_axes - 2), *k); + *n = b_shape.at(num_b_axes - 1); + } else { + CHECK_EQ(b_shape.at(num_b_axes - 1), *k); + *n = b_shape.at(num_b_axes - 2); + } +} + +ep::primitive::BlasTransposeType GetBlasTransposeType(bool transpose) { + return transpose ? ep::primitive::BlasTransposeType::T : ep::primitive::BlasTransposeType::N; +} + +template +struct MulOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a * b; } +}; + +template +struct AddOp { + __device__ __forceinline__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +int GetLaunchPackSize(const int64_t cols) { + constexpr int type_pack_size = cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && cols % launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +__global__ void BroadcastMulKernel(const T* x, const T* y, T* out, const IndexType cols, + const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const LoadPack* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec = *x_load; + LoadPack out_store; + const T y_val = y[row_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { out_store.elem[i] = x_vec.elem[i] * y_val; } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +void DispatchBroadcastMulPackSize(ep::Stream* stream, const T* x, const T* y, T* out, + const IndexType cols, const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else if (pack_size == 4) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else if (pack_size == 2) { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } else { + BroadcastMulKernel + <<As()->cuda_stream()>>>(x, y, out, cols, + elem_cnt); + } +} + +template +void DispatchBroadcastMulIndexType(ep::Stream* stream, const T* x, const T* y, T* out, + const int64_t cols, const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); + } else { + 
DispatchBroadcastMulPackSize(stream, x, y, out, cols, elem_cnt); + } +} + +template +__global__ void BroadcastAddElementwiseMulKernel(const T* x, const T* y, const T* z, T* out, + const IndexType cols, const IndexType elem_cnt) { + const IndexType global_thread_id = blockDim.x * blockIdx.x + threadIdx.x; + using LoadPack = cuda::elementwise::Packed; + for (IndexType linear_index = global_thread_id * pack_size, + step = gridDim.x * blockDim.x * pack_size; + linear_index < elem_cnt; linear_index += step) { + const IndexType row_idx = linear_index / cols; + const IndexType col_idx = linear_index - row_idx * cols; + const LoadPack* x_load = reinterpret_cast(x + linear_index); + const LoadPack* y_load = reinterpret_cast(y + col_idx); + const LoadPack* z_load = reinterpret_cast(z + linear_index); + + LoadPack x_vec = *x_load; + LoadPack y_vec = *y_load; + LoadPack z_vec = *z_load; + LoadPack out_store; + +#pragma unroll + for (int i = 0; i < pack_size; i++) { + out_store.elem[i] = (x_vec.elem[i] + y_vec.elem[i]) * z_vec.elem[i]; + } + *(reinterpret_cast(out + linear_index)) = out_store; + } +} + +template +void DispatchBroadcastAddElementwiseMulPackSize(ep::Stream* stream, const T* x, const T* y, + const T* z, T* out, const IndexType cols, + const IndexType elem_cnt) { + int grid_size; + const int pack_size = GetLaunchPackSize(cols); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else if (pack_size == 4) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else if (pack_size == 2) { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } else { + BroadcastAddElementwiseMulKernel + <<As()->cuda_stream()>>>(x, y, z, out, + cols, elem_cnt); + } +} + +template +void DispatchBroadcastAddElementwiseMulIndexType(ep::Stream* stream, const T* x, const T* y, + const T* z, T* out, const int64_t cols, + const int64_t elem_cnt) { + if (elem_cnt < GetMaxVal()) { + DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); + } else { + DispatchBroadcastAddElementwiseMulPackSize(stream, x, y, z, out, cols, elem_cnt); + } +} + +} // namespace + +namespace user_op { + +std::unique_ptr NewMatmulPrimitive(DeviceType device_type, + DataType data_type, bool transpose_a, + bool transpose_b) { + const auto trans_a = GetBlasTransposeType(transpose_a); + const auto trans_b = GetBlasTransposeType(transpose_b); + return ep::primitive::NewPrimitive(device_type, data_type, trans_a, + trans_b); +} + +template +std::unique_ptr NewReduceMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/false, + /*transpose_b=*/false); +} + +auto ReduceMatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const KernelRegContext& ctx) { + return NewReduceMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +std::unique_ptr NewWeightGradMatmulPrimitive(Context* ctx) { + const DataType data_type = ctx->TensorDesc4ArgNameAndIndex("x", 0)->data_type(); + return NewMatmulPrimitive(ctx->device_type(), data_type, /*transpose_a=*/true, + /*transpose_b=*/false); +} + +auto WeightGradMatmulPrimitiveExists() { + return hob::make_custom("MatmulPrimitiveExists", [](const 
KernelRegContext& ctx) { + return NewWeightGradMatmulPrimitive(&ctx).operator bool(); + }); +} + +template +class FusedCrossFeatureInteractionGradKernel final : public OpKernel, public CudaGraphSupport { + public: + FusedCrossFeatureInteractionGradKernel() = default; + ~FusedCrossFeatureInteractionGradKernel() override = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { + const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + + const int64_t batch_size = dy->shape_view().At(0); + const int64_t hidden_size = dy->shape_view().At(1); + const int64_t out_size = weight->shape_view().At(0); + const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); + + Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); + Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); + Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); + Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + // step1: Get dbias. + const T* ones = nullptr; + auto* cuda_device = dynamic_cast(ctx->stream()->device()); + if (cuda_device != nullptr) { + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); + } + size_t m = 0, n = 0, k = 0; + DimVector dy_shape(2); + dy->shape_view().ToDimVector(&dy_shape); + DimVector ones_buf_shape(2); + ones_buf_shape.at(0) = 1; + ones_buf_shape.at(1) = batch_size; + InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); + auto reduce_matmul = NewReduceMatmulPrimitive(ctx); + CHECK(reduce_matmul); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, dy->dptr(), 0.0, dbias->mut_dptr()); + + // step2: Get dmatmul_result0. 
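// Sketch of what step 2 computes (shapes as documented in the forward kernel:
// B = batch_size, H = hidden_size). In v1 the forward is
//   matmul_result0 = x matmul weight              // (B, 1)
//   out            = x0 * matmul_result0 + bias + x
// so d(matmul_result0) is the row-wise reduction of dy * x0 (elementwise).
// The code below materializes dy * x0 in tmp_buffer, then reduces over H by
// multiplying with a ones vector: (B, H) matmul (H, 1) -> (B, 1), i.e.
//   dmatmul_result0[b] = sum_h dy[b, h] * x0[b, h].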
+ T* dy_mul_x0 = reinterpret_cast(tmp_buffer->mut_dptr()); + T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); + OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dy_mul_x0, dy->dptr(), + x0->dptr(), + ctx->stream()->As()->cuda_stream())); + + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), hidden_size)); + DimVector dy_mul_x0_shape(2); + dy->shape_view().ToDimVector(&dy_mul_x0_shape); + ones_buf_shape.at(0) = hidden_size; + ones_buf_shape.at(1) = 1; + InferMatmulMNK(dy_mul_x0_shape, ones_buf_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dy_mul_x0, ones, 0.0, dmatmul_result0); + + // step3: Get dx + T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T)) + + GetCudaAlignedSize(batch_size * sizeof(T))); + DimVector dmatmul_result_shape(2); + dmatmul_result_shape.at(0) = batch_size; + dmatmul_result_shape.at(1) = 1; // todo change to hidden size + DimVector weight_shape(2); + weight->shape_view().ToDimVector(&weight_shape); + InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, + reinterpret_cast(dx_buf)); + OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, + dy->dptr(), + ctx->stream()->As()->cuda_stream())); + + // step4: Get dw. + DimVector x_shape(2); + x->shape_view().ToDimVector(&x_shape); + + InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); + auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); + CHECK(weight_grad_matmul); + weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, + dw->mut_dptr()); + + // step5: Get dx0. 
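// Sketch of step 5: in the v1 forward, x0 only enters through the term
// x0 * broadcast(matmul_result0), where matmul_result0 holds one scalar per
// row. Hence dx0[b, h] = dy[b, h] * matmul_result0[b], which is exactly what
// BroadcastMulKernel does below: each row of dy is scaled by the per-row
// value loaded from matmul_result.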
+ DispatchBroadcastMulIndexType(ctx->stream(), dy->dptr(), matmul_result->dptr(), + dx0->mut_dptr(), hidden_size, dy_elem_cnt); + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction_v1_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ + && (HobDataType("dy", 0) == GetDataType::value) \ + && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ + .SetInferTmpSizeFn([](InferContext* ctx) { \ + size_t tmp_size = 0; \ + const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ + const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ + const int64_t batch_size = dy.shape().At(0); \ + size_t dy_mul_x0_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ + size_t dmatmul_result_size = GetCudaAlignedSize(batch_size * sizeof(dtype)); \ + size_t dx_buf_size = dy_mul_x0_size; \ + tmp_size = dy_mul_x0_size + dmatmul_result_size + dx_buf_size; \ + return tmp_size; \ + }); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V1_GRAD_KERNEL(half) + +template +class FusedCrossFeatureInteractionV2GradKernel final : public OpKernel, public CudaGraphSupport { + public: + FusedCrossFeatureInteractionV2GradKernel() = default; + ~FusedCrossFeatureInteractionV2GradKernel() = default; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + + private: + using user_op::OpKernel::Compute; + void Compute(KernelComputeContext* ctx) const override { + const Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const Tensor* bias = ctx->Tensor4ArgNameAndIndex("bias", 0); + const Tensor* x0 = ctx->Tensor4ArgNameAndIndex("x0", 0); + const Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const Tensor* matmul_result = ctx->Tensor4ArgNameAndIndex("matmul_result", 0); + + const int64_t batch_size = dy->shape_view().At(0); + const int64_t in_size = weight->shape_view().At(1); + const int64_t hidden_size = weight->shape_view().At(0); + const int64_t dy_elem_cnt = dy->shape_view().elem_cnt(); + + Tensor* dx0 = ctx->Tensor4ArgNameAndIndex("dx0", 0); + Tensor* dw = ctx->Tensor4ArgNameAndIndex("dw", 0); + Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + Tensor* dbias = ctx->Tensor4ArgNameAndIndex("dbias", 0); + Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + // step1: Get dx0. + DispatchBroadcastAddElementwiseMulIndexType(ctx->stream(), matmul_result->dptr(), + bias->dptr(), dy->dptr(), + dx0->mut_dptr(), hidden_size, dy_elem_cnt); + + // step2: Get dmatmul_result0. 
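// Sketch of step 2 for the v2 variant, whose forward (per the docstring in
// the forward kernel) is
//   out = (x matmul weight + bias) * x0 + x      // elementwise * over (B, E)
// The gradient w.r.t. the pre-scale term (x matmul weight + bias) is therefore
// dy * x0 (elementwise), which the Binary(MulOp) call below writes into
// tmp_buffer; the same buffer then feeds dx (matmul with weight), dw
// (transposed matmul with x) and dbias (column reduction via a ones vector).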
+ T* dmatmul_result0 = reinterpret_cast(tmp_buffer->mut_dptr()); + OF_CUDA_CHECK(cuda::elementwise::Binary(MulOp(), dy_elem_cnt, dmatmul_result0, dy->dptr(), + x0->dptr(), + ctx->stream()->As()->cuda_stream())); + // step3: Get dx + T* dx_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(dy_elem_cnt * sizeof(T))); + DimVector dmatmul_result_shape(2); + dmatmul_result_shape.at(0) = batch_size; + dmatmul_result_shape.at(1) = hidden_size; + DimVector weight_shape(2); + weight->shape_view().ToDimVector(&weight_shape); + size_t m = 0, n = 0, k = 0; + InferMatmulMNK(dmatmul_result_shape, weight_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, + &k); + auto reduce_matmul = NewReduceMatmulPrimitive(ctx); + CHECK(reduce_matmul); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, weight->dptr(), 0.0, + reinterpret_cast(dx_buf)); + OF_CUDA_CHECK(cuda::elementwise::Binary(AddOp(), dy_elem_cnt, dx->mut_dptr(), dx_buf, + dy->dptr(), + ctx->stream()->As()->cuda_stream())); + + // step4: Get dw. + DimVector x_shape(2); + x->shape_view().ToDimVector(&x_shape); + + InferMatmulMNK(dmatmul_result_shape, x_shape, /*trans_a=*/true, /*trans_b=*/false, &m, &n, &k); + auto weight_grad_matmul = NewWeightGradMatmulPrimitive(ctx); + CHECK(weight_grad_matmul); + weight_grad_matmul->Launch(ctx->stream(), m, n, k, 1.0, dmatmul_result0, x->dptr(), 0.0, + dw->mut_dptr()); + + // step5: Get dbias. + const T* ones = nullptr; + auto* cuda_device = dynamic_cast(ctx->stream()->device()); + if (cuda_device != nullptr) { + ones = static_cast(cuda_device->GetConstOnes(dy->data_type(), batch_size)); + } + DimVector dy_shape(2); + dy->shape_view().ToDimVector(&dy_shape); + DimVector ones_buf_shape(2); + ones_buf_shape.at(0) = 1; + ones_buf_shape.at(1) = batch_size; + InferMatmulMNK(ones_buf_shape, dy_shape, /*trans_a=*/false, /*trans_b=*/false, &m, &n, &k); + reduce_matmul->Launch(ctx->stream(), m, n, k, 1.0, ones, + reinterpret_cast(dmatmul_result0), 0.0, dbias->mut_dptr()); + } +}; + +#define REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_cross_feature_interaction_v2_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((HobDeviceType() == DeviceType::kCUDA) \ + && (HobDataType("dy", 0) == GetDataType::value) \ + && ReduceMatmulPrimitiveExists() && WeightGradMatmulPrimitiveExists()) \ + .SetInferTmpSizeFn([](InferContext* ctx) { \ + size_t tmp_size = 0; \ + const TensorDesc& dy = ctx->InputTensorDesc("dy", 0); \ + const int64_t dy_elem_cnt = dy.shape().elem_cnt(); \ + size_t dmatmul_result_size = GetCudaAlignedSize(dy_elem_cnt * sizeof(dtype)); \ + size_t dx_buf_size = dmatmul_result_size; \ + tmp_size = dmatmul_result_size + dx_buf_size; \ + return tmp_size; \ + }); + +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(float) +REGISTER_FUSED_CROSS_FEATURE_INTERACTION_V2_GRAD_KERNEL(half) + +} // namespace user_op + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp index 07993ac..bf662d2 100644 --- a/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_dot_feature_interaction_kernel.hip.cpp @@ -1,923 +1,923 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/ep/include/primitive/batch_matmul.h" -#include "oneflow/core/kernel/cuda_graph_support.h" - -namespace oneflow { - -namespace { - -__global__ void GenerateGatherIndicesGpu(const int32_t elem_cnt, const int32_t stride, - const int32_t in_cols, const int32_t offset, - int32_t* gather_indices) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row = i / stride; - const int32_t col = i - row * stride; - if (col < row + offset) { - int32_t in_index = row * in_cols + col; - int32_t idx = row * (offset + row - 1 + offset) / 2 + col; - gather_indices[idx] = in_index; - } - } -} - -template -__global__ void GatherConcatGpu(int32_t elem_cnt, int32_t out_cols, int32_t valid_out_cols, - int32_t in_cols, int32_t output_concat_end_dim, - const int32_t* gather_indices, const T* in, - const T* output_concat_ptr, T* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row = i / out_cols; - const int32_t col = i - row * out_cols; - T out_val; - if (col < output_concat_end_dim) { - const int32_t output_concat_idx = row * output_concat_end_dim + col; - out_val = output_concat_ptr[output_concat_idx]; - } else if (col < valid_out_cols) { - const int32_t gather_col_idx = gather_indices[col - output_concat_end_dim]; - const int32_t in_offset = row * in_cols + gather_col_idx; - out_val = in[in_offset]; - } else { - out_val = 0; - } - out_ptr[i] = out_val; - } -} - -template -__global__ void ScatterSplitAddTransposeGpu(int32_t elem_cnt, int32_t stride_dim, int32_t out_dim, - int32_t in_grad_stride, int32_t in_grad_matrix_dim, - int32_t in_grad_matrix_valid_dim, - int32_t output_concat_end_dim, const int32_t offset, - const T* dy, T* output_concat_grad, T* in_grad) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row = i / stride_dim; - const int32_t col = i - row * stride_dim; - if (col < output_concat_end_dim) { - output_concat_grad[row * output_concat_end_dim + col] = dy[row * out_dim + col]; - } else { - int32_t in_col_id = col - output_concat_end_dim; - const int32_t matrix_row = in_col_id / in_grad_matrix_dim; - const int32_t matrix_col = in_col_id - matrix_row * in_grad_matrix_dim; - T grad_val = 0; - const T* row_dy = dy + row * out_dim + output_concat_end_dim; - if (matrix_row < in_grad_matrix_valid_dim && matrix_col < in_grad_matrix_valid_dim) { - if (matrix_col < matrix_row) { - int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; - grad_val = row_dy[dy_col_idx]; - } else if (matrix_row < matrix_col) { - // transpose add - int32_t trans_row_id = matrix_col; - int32_t trans_col_id = matrix_row; - int32_t dy_col_idx = - trans_row_id * (offset + trans_row_id - 1 + offset) / 2 + trans_col_id; - grad_val = row_dy[dy_col_idx]; - } else if ((matrix_row == matrix_col) && (offset == 1)) { - int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; - grad_val = row_dy[dy_col_idx] * static_cast(2); - } - } - int32_t in_grad_offset = row 
* in_grad_stride + in_col_id; - in_grad[in_grad_offset] = grad_val; - } - } -} - -template -void ConcatFeatures(user_op::KernelComputeContext* ctx, int64_t dst_rows, int64_t dst_cols, - void* dst_ptr) { - const int64_t feature_input_size = ctx->input_size("features"); - auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); - DimVector dst_shape = {dst_rows, dst_cols}; - int64_t out_col_offset = 0; - for (int64_t i = 0; i < feature_input_size; ++i) { - const user_op::Tensor* feature = ctx->Tensor4ArgNameAndIndex("features", i); - const int64_t feature_rows = feature->shape_view().At(0); - const int64_t feature_cols = feature->shape_view().Count(1); - DimVector dst_pos_vec = {0, out_col_offset}; - DimVector src_shape = {feature_rows, feature_cols}; - DimVector src_pos_vec = {0, 0}; - DimVector extent_vec = {feature_rows, feature_cols}; - primitive->Launch(ctx->stream(), feature->data_type(), 2, dst_ptr, dst_shape.data(), - dst_pos_vec.data(), feature->dptr(), src_shape.data(), src_pos_vec.data(), - extent_vec.data()); - out_col_offset += feature_cols; - } - int64_t pad_dim = dst_cols - out_col_offset; - if (pad_dim > 0) { - char* out_ptr = reinterpret_cast(dst_ptr) + out_col_offset * sizeof(T); - OF_CUDA_CHECK(hipMemset2DAsync(out_ptr, dst_cols * sizeof(T), 0, pad_dim * sizeof(T), dst_rows, - ctx->stream()->As()->cuda_stream())); - } -} - -template -void GatherConcatKernel(ep::Stream* stream, int32_t elem_cnt, int32_t out_dim, - int32_t valid_out_dim, int32_t features_concated_dim, - int32_t concated_padded_dim, int32_t output_concat_end_dim, - bool self_interaction, const T* matmul_out, const T* output_concat_ptr, - int32_t* gather_indices_ptr, T* out_ptr) { - hipStream_t cuda_stream = stream->As()->cuda_stream(); - const int32_t gen_indices_elem_cnt = features_concated_dim * features_concated_dim; - int32_t offset = self_interaction ? 1 : 0; - hipLaunchKernelGGL(GenerateGatherIndicesGpu, BlocksNum4ThreadsNum(gen_indices_elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, gen_indices_elem_cnt, features_concated_dim, - concated_padded_dim, offset, gather_indices_ptr); - - int32_t matmul_stride = concated_padded_dim * concated_padded_dim; - hipLaunchKernelGGL(GatherConcatGpu, BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, - elem_cnt, out_dim, valid_out_dim, matmul_stride, output_concat_end_dim, gather_indices_ptr, - matmul_out, output_concat_ptr, out_ptr); -} - -template -void ScatterSplitAddTranspose(ep::Stream* stream, int32_t batch_size, int32_t out_dim, - int32_t concated_padded_dim, int32_t features_concated_dim, - int32_t output_concat_end_dim, const bool self_interaction, - const T* dy, T* output_concat_grad, T* matmul_out_grad_ptr) { - int32_t stride_dim = output_concat_end_dim + concated_padded_dim * concated_padded_dim; - int32_t matmul_stride = concated_padded_dim * concated_padded_dim; - const int32_t elem_cnt = batch_size * stride_dim; - int32_t offset = self_interaction ? 
1 : 0; - ScatterSplitAddTransposeGpu<<As()->cuda_stream()>>>( - elem_cnt, stride_dim, out_dim, matmul_stride, concated_padded_dim, features_concated_dim, - output_concat_end_dim, offset, dy, output_concat_grad, matmul_out_grad_ptr); -} - -template -void ConcatFeaturesGrad(user_op::KernelComputeContext* ctx, const int64_t batch_size, - const int64_t concated_padded_dim, const int64_t vector_size, - const T* concated_features_grad) { - auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); - DimVector src_shape = {batch_size, concated_padded_dim * vector_size}; - int64_t in_col_offset = 0; - for (int64_t i = 0; i < ctx->output_size("features_grad"); ++i) { - user_op::Tensor* feature_grad = ctx->Tensor4ArgNameAndIndex("features_grad", i); - const int64_t feature_grad_rows = feature_grad->shape_view().At(0); - const int64_t feature_grad_cols = feature_grad->shape_view().Count(1); - DimVector dst_shape = {feature_grad_rows, feature_grad_cols}; - DimVector dst_pos_vec = {0, 0}; - DimVector src_pos_vec = {0, in_col_offset}; - DimVector extent_vec = {feature_grad_rows, feature_grad_cols}; - in_col_offset += feature_grad_cols; - primitive->Launch(ctx->stream(), feature_grad->data_type(), 2, feature_grad->mut_dptr(), - dst_shape.data(), dst_pos_vec.data(), concated_features_grad, - src_shape.data(), src_pos_vec.data(), extent_vec.data()); - } -} - -template -struct DefaultComputeType { - using type = T; -}; - -template<> -struct DefaultComputeType { - using type = float; -}; - -template -struct alignas(sizeof(T) * pack_size) Pack { - T elem[pack_size]; -}; - -int64_t GetPaddedDim(int64_t dim) { - const int64_t align_dim = 16; - const int64_t padded_dim = (dim + align_dim - 1) / align_dim * align_dim; - return padded_dim; -} - -template -struct DotFwdParam { - const T* in[max_in]; - int32_t in_feature_dim[max_in]; - int32_t dim_start_offset[max_in]; - int32_t features_dim; - const T* output_concat; - int32_t output_concat_size; - T* out; - int32_t num_in; -}; - -constexpr int kUnrollDim = 2; -template -__global__ void DotFeatureInteractionWmmaImpl( - int m_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, int vector_num_pack, - int padded_vector_num_pack, int out_num_cols, int out_num_cols_num_pack, int in_shared_mem_cols, - int in_shared_mem_cols_num_pack, int acc_shared_mem_cols, int acc_shared_mem_cols_num_pack, - int offset, int output_padding, DotFwdParam param) { - asm volatile("s_trap 0;"); -} - -template -struct KTileDim { - static const int val = 16; -}; - -template<> -struct KTileDim { - static const int val = 8; -}; - -template -struct DotFeatureInteractionKernel { - static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, - int vector_size, int out_num_cols, bool self_interaction, int output_padding, - const DotFwdParam& param) { - const int block_size = 128; - const int block_dim_x = 32; - const int block_dim_y = block_size / block_dim_x; - const int num_blocks = batch_size; - const int mn_tile_dim = 16; - const int k_tile_dim = KTileDim::val; - const int64_t padded_vector_size = GetPaddedDim(vector_size); - const int m_num_tiles = concated_padded_dim / mn_tile_dim; - const int k_num_tiles = padded_vector_size / k_tile_dim; - const int skew_in = 8; - const int skew_acc = 8; - const int in_shared_mem_num_cols = padded_vector_size + skew_in; - const int acc_shared_mem_num_cols = concated_padded_dim + skew_acc; - const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); - using ComputeType = 
typename DefaultComputeType::type; - const size_t acc_shared_mem_bytes = - concated_padded_dim * acc_shared_mem_num_cols * sizeof(ComputeType); - const size_t total_shared_mem_bytes = in_shared_mem_bytes + acc_shared_mem_bytes; - const int32_t offset = self_interaction ? 1 : 0; - const int out_num_cols_num_pack = out_num_cols / pack_size; - const int vector_num_pack = vector_size / pack_size; - const int padded_vector_num_pack = padded_vector_size / pack_size; - const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; - const int acc_shared_mem_cols_num_pack = acc_shared_mem_num_cols / pack_size; - int max_active_blocks; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, - DotFeatureInteractionWmmaImpl, - block_size, total_shared_mem_bytes)); - if (max_active_blocks <= 0) { return false; } - hipStream_t cuda_stream = stream->As()->cuda_stream(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(DotFeatureInteractionWmmaImpl), num_blocks, dim3(block_dim_x, block_dim_y), total_shared_mem_bytes, cuda_stream, - m_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, - padded_vector_num_pack, out_num_cols, out_num_cols_num_pack, in_shared_mem_num_cols, - in_shared_mem_cols_num_pack, acc_shared_mem_num_cols, acc_shared_mem_cols_num_pack, - offset, output_padding, param); - return true; - } -}; - -template -struct DotBwdParam { - const T* out_grad; - const T* in[max_in]; - T* in_grad[max_in]; - T* output_concat_grad; - int32_t output_concat_size; - int32_t in_feature_dim[max_in]; - int32_t dim_start_offset[max_in]; - int32_t features_dim; - int32_t num_in; -}; - -template -__global__ void DotFeatureInteractionBackwardWmmaImpl( - int m_num_tiles, int n_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, - int vector_num_pack, int padded_vector_num_pack, int out_num_cols, int in_shared_mem_cols, - int in_shared_mem_cols_num_pack, int matrix_out_grad_shared_mem_cols, int offset, - DotBwdParam param) { - asm volatile("s_trap 0;"); -} - -template -struct DotFeatureInteractionBackwardKernel { - static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, - int vector_size, int out_num_cols, bool self_interaction, - const DotBwdParam& param) { - const int block_size = 256; - const int block_dim_x = 32; - const int block_dim_y = block_size / block_dim_x; - const int num_blocks = batch_size; - const int mn_tile_dim = 16; - const int k_tile_dim = KTileDim::val; - const int64_t padded_vector_size = GetPaddedDim(vector_size); - const int m_num_tiles = concated_padded_dim / mn_tile_dim; - const int k_num_tiles = concated_padded_dim / k_tile_dim; - const int n_num_tiles = padded_vector_size / mn_tile_dim; - const int skew_in = 8; - const int in_shared_mem_num_cols = padded_vector_size + skew_in; - const int matrix_out_grad_shared_mem_cols = concated_padded_dim + skew_in; - const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); - const size_t matrix_out_grad_shared_mem_bytes = - concated_padded_dim * matrix_out_grad_shared_mem_cols * sizeof(T); - using ComputeType = typename DefaultComputeType::type; - const size_t in_grad_shared_mem_bytes = - concated_padded_dim * in_shared_mem_num_cols * sizeof(ComputeType); - const size_t total_shared_mem_bytes = - in_shared_mem_bytes + matrix_out_grad_shared_mem_bytes + in_grad_shared_mem_bytes; - const int32_t offset = self_interaction ? 
1 : 0; - const int vector_num_pack = vector_size / pack_size; - const int padded_vector_num_pack = padded_vector_size / pack_size; - const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; - int max_active_blocks; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, - DotFeatureInteractionBackwardWmmaImpl, - block_size, total_shared_mem_bytes)); - if (max_active_blocks <= 0) { return false; } - hipStream_t cuda_stream = stream->As()->cuda_stream(); - DotFeatureInteractionBackwardWmmaImpl - <<>>( - m_num_tiles, n_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, - padded_vector_num_pack, out_num_cols, in_shared_mem_num_cols, - in_shared_mem_cols_num_pack, matrix_out_grad_shared_mem_cols, offset, param); - - return true; - } -}; - -template -bool DispatchFeatureInteractionDotPackSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t batch_size = out->shape_view().At(0); - const int64_t out_num_cols = out->shape_view().At(1); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); - DotFwdParam param; - param.num_in = input_size; - param.out = out->mut_dptr(); - int64_t features_concated_dim = 0; - for (int i = 0; i < input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - param.dim_start_offset[i] = features_concated_dim; - features_concated_dim += param.in_feature_dim[i]; - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - param.features_dim = features_concated_dim; - if (ctx->has_input("output_concat", 0)) { - const user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); - param.output_concat = output_concat->dptr(); - param.output_concat_size = output_concat->shape_view().At(1); - } else { - param.output_concat = nullptr; - param.output_concat_size = 0; - } - const bool self_interaction = ctx->Attr("self_interaction"); - const int32_t output_padding = ctx->Attr("output_padding"); - if (vector_size % 4 == 0 && out_num_cols % 4 == 0) { - return DotFeatureInteractionKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - output_padding, param); - } else if (vector_size % 2 == 0 && out_num_cols % 2 == 0) { - return DotFeatureInteractionKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - output_padding, param); - } else { - return DotFeatureInteractionKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - output_padding, param); - } -} - -template -bool DispatchFeatureInteractionDotBackwardPackSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t batch_size = dy->shape_view().At(0); - const int64_t out_num_cols = dy->shape_view().At(1); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); - DotBwdParam param; - param.num_in = input_size; - param.out_grad = dy->dptr(); - int64_t features_concated_dim = 0; - for (int i = 0; i < input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - 
param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - param.dim_start_offset[i] = features_concated_dim; - features_concated_dim += param.in_feature_dim[i]; - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - param.features_dim = features_concated_dim; - if (ctx->has_output("output_concat_grad", 0)) { - user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); - param.output_concat_grad = output_concat_grad->mut_dptr(); - param.output_concat_size = output_concat_grad->shape_view().At(1); - } else { - param.output_concat_grad = nullptr; - param.output_concat_size = 0; - } - const bool self_interaction = ctx->Attr("self_interaction"); - if (vector_size % 4 == 0) { - return DotFeatureInteractionBackwardKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - param); - } else if (vector_size % 2 == 0) { - return DotFeatureInteractionBackwardKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - param); - } else { - return DotFeatureInteractionBackwardKernel::Launch( - ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, - param); - } -} - -template -struct Param { - const T* in[max_in]; - int32_t in_feature_dim[max_in]; - T* out; - int32_t num_in; -}; - -template -__global__ void FeatureInteractionSum(int64_t batch_size, int64_t vector_num_pack, - Param param) { - using ComputeType = typename DefaultComputeType::type; - Pack* dst_pack = reinterpret_cast*>(param.out); - for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; - batch_idx += gridDim.x * blockDim.y) { - Pack* batch_out = dst_pack + batch_idx * vector_num_pack; - for (int col_id = threadIdx.x; col_id < vector_num_pack; col_id += blockDim.x) { - Pack sum; - Pack square_sum; -#pragma unroll - for (int k = 0; k < pack_size; ++k) { - sum.elem[k] = static_cast(0); - square_sum.elem[k] = static_cast(0); - } - for (int i = 0; i < max_in; ++i) { - if (i >= param.num_in) { break; } - const Pack* batch_in = - reinterpret_cast*>(param.in[i]) - + batch_idx * param.in_feature_dim[i] * vector_num_pack; -#pragma unroll - for (int j = 0; j < param.in_feature_dim[i]; ++j) { - Pack val = batch_in[j * vector_num_pack + col_id]; -#pragma unroll - for (int k = 0; k < pack_size; ++k) { - const ComputeType compute_val = static_cast(val.elem[k]); - sum.elem[k] += compute_val; - square_sum.elem[k] += compute_val * compute_val; - } - } - } - Pack out; -#pragma unroll - for (int k = 0; k < pack_size; ++k) { - out.elem[k] = static_cast((sum.elem[k] * sum.elem[k] - square_sum.elem[k]) - * static_cast(0.5)); - } - batch_out[col_id] = out; - } - } -} - -template -struct GradParam { - const T* out_grad; - const T* in[max_in]; - int32_t in_feature_dim[max_in]; - T* in_grad[max_in]; - int32_t num_in; -}; - -template -__global__ void FeatureInteractionSumGrad(int64_t batch_size, int64_t vector_size, - GradParam param) { - using ComputeType = typename DefaultComputeType::type; - for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; - batch_idx += gridDim.x * blockDim.y) { - const T* batch_out_grad = param.out_grad + batch_idx * vector_size; - for (int col_id = threadIdx.x; col_id < vector_size; col_id += blockDim.x) { - ComputeType sum = 0; - for (int i = 0; i < max_in; ++i) { - 
if (i >= param.num_in) { break; } - const T* batch_in = param.in[i] + batch_idx * param.in_feature_dim[i] * vector_size; - for (int j = 0; j < param.in_feature_dim[i]; ++j) { - sum += static_cast(batch_in[j * vector_size + col_id]); - } - } - for (int i = 0; i < max_in; ++i) { - if (i >= param.num_in) { break; } - const int64_t in_batch_offset = batch_idx * param.in_feature_dim[i] * vector_size; - const T* batch_in = param.in[i] + in_batch_offset; - T* batch_in_grad = param.in_grad[i] + in_batch_offset; - for (int j = 0; j < param.in_feature_dim[i]; ++j) { - const int64_t offset = j * vector_size + col_id; - batch_in_grad[offset] = - static_cast(static_cast(batch_out_grad[col_id]) - * (sum - static_cast(batch_in[offset]))); - } - } - } - } -} - -void GetBlockDims(const int64_t vector_size, int* block_dim_x, int* block_dim_y) { - const int block_size = 256; - if (vector_size < block_size) { - *block_dim_x = std::ceil(static_cast(vector_size) / 8) * 8; - *block_dim_y = (block_size + *block_dim_x - 1) / *block_dim_x; - } else { - *block_dim_x = block_size; - *block_dim_y = 1; - } -} - -int GetNumBlocks(const int64_t num_instances, const int64_t instance_per_block) { - int max_blocks = (num_instances + instance_per_block - 1) / instance_per_block; - return std::min(max_blocks, kCudaMaxBlocksNum); -} - -template -void DispatchFeatureInteractionSumPackSize(ep::Stream* stream, const int64_t batch_size, - const int64_t vector_size, - const Param& param) { - int block_dim_x; - int block_dim_y; - const int pack_size = (vector_size % 2 == 0) ? 2 : 1; - const int64_t vector_num_pack = vector_size / pack_size; - GetBlockDims(vector_num_pack, &block_dim_x, &block_dim_y); - const int num_blocks = GetNumBlocks(batch_size, block_dim_y); - dim3 block_dims = dim3(block_dim_x, block_dim_y); - hipStream_t cuda_stream = stream->As()->cuda_stream(); - if (pack_size == 2) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); - } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); - } -} - -template -void DispatchFeatureInteractionSumInputSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t batch_size = out->shape_view().At(0); - const int64_t vector_size = out->shape_view().At(1); - Param param; - param.num_in = input_size; - param.out = out->mut_dptr(); - for (int i = 0; i < input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - } - DispatchFeatureInteractionSumPackSize(ctx->stream(), batch_size, vector_size, param); -} - -template -void DispatchFeatureInteractionSumGradInputSize(user_op::KernelComputeContext* ctx, - const int32_t input_size) { - CHECK_LE(input_size, max_in) << input_size; - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const int64_t batch_size = dy->shape_view().At(0); - const int64_t vector_size = dy->shape_view().At(1); - int block_dim_x; - int block_dim_y; - GetBlockDims(vector_size, &block_dim_x, &block_dim_y); - const int num_blocks = GetNumBlocks(batch_size, block_dim_y); - dim3 block_dims = dim3(block_dim_x, block_dim_y); - GradParam param; - param.num_in = input_size; - param.out_grad = dy->dptr(); - for (int i = 0; i < 
input_size; ++i) { - param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); - param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); - param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); - } - FeatureInteractionSumGrad - <<stream()->As()->cuda_stream()>>>( - batch_size, vector_size, param); -} - -} // namespace - -template -class FusedDotFeatureInteractionPoolingSumKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionPoolingSumKernel() = default; - ~FusedDotFeatureInteractionPoolingSumKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } else if (input_size == 2) { - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } else if (input_size <= 8) { - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; - DispatchFeatureInteractionSumInputSize(ctx, input_size); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "sum")); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(half) - -template -bool TryLaunchTensorCoreDotKernel(user_op::KernelComputeContext* ctx) { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } else if (input_size == 2) { - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } else if (input_size <= 8) { - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; - return DispatchFeatureInteractionDotPackSize(ctx, input_size); - } -} - -template -bool TryLaunchTensorCoreDotBackwardKernel(user_op::KernelComputeContext* ctx) { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } else if (input_size == 2) { - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } else if (input_size <= 8) { - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; - return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); - } -} -template -class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionKernel() = default; - ~FusedDotFeatureInteractionKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const DataType data_type = out->data_type(); - CHECK_LT(out->shape_view().elem_cnt(), GetMaxVal()); - auto* cuda_stream = ctx->stream()->As(); - // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) - // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { - // bool success = TryLaunchTensorCoreDotKernel(ctx); - // if (success == true) { return; } - // } - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t batch_size = out->shape_view().At(0); - int64_t features_concated_dim = 0; - for (int64_t i = 0; i < ctx->input_size("features"); ++i) { - features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); - const int64_t out_dim = out->shape_view().At(1); - const int32_t output_padding = ctx->Attr("output_padding"); - const int64_t valid_out_dim = out_dim - output_padding; - const bool self_interaction = ctx->Attr("self_interaction"); - - T* matmul_out = reinterpret_cast(tmp_buffer->mut_dptr()); - size_t matmul_out_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - const int64_t interaction_dim = self_interaction - ? 
features_concated_dim * (features_concated_dim + 1) / 2 - : features_concated_dim * (features_concated_dim - 1) / 2; - int32_t* gather_indices_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size); - size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); - T* padded_concated_features_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size + gather_indices_size); - size_t padded_concated_features_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - CHECK_GE(tmp_buffer->shape_view().elem_cnt(), - matmul_out_size + gather_indices_size + padded_concated_features_size); - ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, - padded_concated_features_ptr); - auto batch_matmul = ep::primitive::NewPrimitive( - ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, - ep::primitive::BlasTransposeType::T); - batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, concated_padded_dim, - vector_size, 1.0, padded_concated_features_ptr, - padded_concated_features_ptr, 0.0, matmul_out); - - int64_t output_concat_end_dim = 0; - const T* output_concat_ptr = nullptr; - if (ctx->has_input("output_concat", 0)) { - user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); - output_concat_end_dim = output_concat->shape_view().At(1); - output_concat_ptr = output_concat->dptr(); - } - CHECK_EQ(valid_out_dim, output_concat_end_dim + interaction_dim); - GatherConcatKernel(ctx->stream(), out->shape_view().elem_cnt(), out_dim, valid_out_dim, - features_concated_dim, concated_padded_dim, output_concat_end_dim, - self_interaction, matmul_out, output_concat_ptr, gather_indices_ptr, - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -user_op::InferTmpSizeFn GenFusedDotFeatureInteractionInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - const Shape& first_feature_shape = ctx->InputShape("features", 0); - const int64_t batch_size = first_feature_shape.At(0); - const int64_t vector_size = first_feature_shape.At(2); - int64_t features_concated_dim = 0; - for (int32_t i = 0; i < ctx->input_size("features"); ++i) { - features_concated_dim += ctx->InputShape("features", i).At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - size_t matmul_out_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - const bool self_interaction = ctx->Attr("self_interaction"); - const int64_t interaction_dim = self_interaction - ? 
features_concated_dim * (features_concated_dim + 1) / 2 - : features_concated_dim * (features_concated_dim - 1) / 2; - size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); - size_t padded_concated_features_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - return matmul_out_size + gather_indices_size + padded_concated_features_size; - }; -} - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "none")) \ - .SetInferTmpSizeFn(GenFusedDotFeatureInteractionInferTmpSizeFn()); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(half) - -template -class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionGradKernel() = default; - ~FusedDotFeatureInteractionGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const DataType data_type = dy->data_type(); - auto* cuda_stream = ctx->stream()->As(); - // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) - // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { - // bool success = TryLaunchTensorCoreDotBackwardKernel(ctx); - // if (success == true) { return; } - // } - const int64_t batch_size = dy->shape_view().At(0); - int64_t features_concated_dim = 0; - for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { - features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features_grad", 0)->shape().At(2); - const int64_t out_dim = dy->shape_view().At(1); - const bool self_interaction = ctx->Attr("self_interaction"); - T* matmul_out_grad_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); - size_t matmul_out_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - T* padded_concated_features_grad_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_grad_size); - size_t padded_concated_features_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - T* padded_concated_features_ptr = reinterpret_cast( - tmp_buffer->mut_dptr() + matmul_out_grad_size + padded_concated_features_grad_size); - size_t padded_concated_features_size = padded_concated_features_grad_size; - CHECK_LE( - matmul_out_grad_size + padded_concated_features_grad_size + padded_concated_features_size, - tmp_buffer->shape_view().elem_cnt()); - ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, - padded_concated_features_ptr); - - T* output_concat_grad_ptr = nullptr; - int64_t output_concat_end_dim = 0; - if (ctx->has_output("output_concat_grad", 0)) { - user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); - output_concat_grad_ptr = output_concat_grad->mut_dptr(); - output_concat_end_dim = 
output_concat_grad->shape_view().At(1); - } - ScatterSplitAddTranspose(ctx->stream(), batch_size, out_dim, concated_padded_dim, - features_concated_dim, output_concat_end_dim, self_interaction, - dy->dptr(), output_concat_grad_ptr, matmul_out_grad_ptr); - - auto batch_matmul = ep::primitive::NewPrimitive( - ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, - ep::primitive::BlasTransposeType::N); - batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, vector_size, - concated_padded_dim, 1.0, matmul_out_grad_ptr, - padded_concated_features_ptr, 0.0, padded_concated_features_grad_ptr); - - ConcatFeaturesGrad(ctx, batch_size, concated_padded_dim, vector_size, - padded_concated_features_grad_ptr); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -user_op::InferTmpSizeFn GenFusedDotFeatureInteractionGradInferTmpSizeFn() { - return [](user_op::InferContext* ctx) { - int64_t features_concated_dim = 0; - for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { - features_concated_dim += ctx->InputShape("features_grad", i).At(1); - } - const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); - const int64_t batch_size = ctx->InputShape("features_grad", 0).At(0); - const int64_t vector_size = ctx->InputShape("features_grad", 0).At(2); - size_t matmul_out_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); - size_t padded_concated_features_grad_size = - GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); - size_t padded_concated_features_size = padded_concated_features_grad_size; - return matmul_out_grad_size + padded_concated_features_grad_size - + padded_concated_features_size; - }; -} - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "none")) \ - .SetInferTmpSizeFn(GenFusedDotFeatureInteractionGradInferTmpSizeFn()); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(half) - -template -class FusedDotFeatureInteractionPoolingSumGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedDotFeatureInteractionPoolingSumGradKernel() = default; - ~FusedDotFeatureInteractionPoolingSumGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const int input_size = ctx->input_size("features"); - if (input_size == 1) { - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } else if (input_size == 2) { - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } else if (input_size <= 8) { - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } else { - CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; - DispatchFeatureInteractionSumGradInputSize(ctx, input_size); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobAttr("pooling") == "sum")); - -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(float) -REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(half) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/ep/include/primitive/batch_matmul.h" +#include "oneflow/core/kernel/cuda_graph_support.h" + +namespace oneflow { + +namespace { + +__global__ void GenerateGatherIndicesGpu(const int32_t elem_cnt, const int32_t stride, + const int32_t in_cols, const int32_t offset, + int32_t* gather_indices) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row = i / stride; + const int32_t col = i - row * stride; + if (col < row + offset) { + int32_t in_index = row * in_cols + col; + int32_t idx = row * (offset + row - 1 + offset) / 2 + col; + gather_indices[idx] = in_index; + } + } +} + +template +__global__ void GatherConcatGpu(int32_t elem_cnt, int32_t out_cols, int32_t valid_out_cols, + int32_t in_cols, int32_t output_concat_end_dim, + const int32_t* gather_indices, const T* in, + const T* output_concat_ptr, T* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row = i / out_cols; + const int32_t col = i - row * out_cols; + T out_val; + if (col < output_concat_end_dim) { + const int32_t output_concat_idx = row * output_concat_end_dim + col; + out_val = output_concat_ptr[output_concat_idx]; + } else if (col < valid_out_cols) { + const int32_t gather_col_idx = gather_indices[col - output_concat_end_dim]; + const int32_t in_offset = row * in_cols + gather_col_idx; + out_val = in[in_offset]; + } else { + out_val = 0; + } + out_ptr[i] = out_val; + } +} + +template +__global__ void ScatterSplitAddTransposeGpu(int32_t elem_cnt, int32_t stride_dim, int32_t out_dim, + int32_t in_grad_stride, int32_t in_grad_matrix_dim, + int32_t in_grad_matrix_valid_dim, + int32_t output_concat_end_dim, const int32_t offset, + const T* dy, T* output_concat_grad, T* in_grad) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row = i / stride_dim; + const int32_t col = i - row * stride_dim; + if (col < output_concat_end_dim) { + output_concat_grad[row * output_concat_end_dim + col] = dy[row * out_dim + col]; + } else { + int32_t in_col_id = col - output_concat_end_dim; + const int32_t matrix_row = in_col_id / in_grad_matrix_dim; + const int32_t matrix_col = in_col_id - matrix_row * 
in_grad_matrix_dim; + T grad_val = 0; + const T* row_dy = dy + row * out_dim + output_concat_end_dim; + if (matrix_row < in_grad_matrix_valid_dim && matrix_col < in_grad_matrix_valid_dim) { + if (matrix_col < matrix_row) { + int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; + grad_val = row_dy[dy_col_idx]; + } else if (matrix_row < matrix_col) { + // transpose add + int32_t trans_row_id = matrix_col; + int32_t trans_col_id = matrix_row; + int32_t dy_col_idx = + trans_row_id * (offset + trans_row_id - 1 + offset) / 2 + trans_col_id; + grad_val = row_dy[dy_col_idx]; + } else if ((matrix_row == matrix_col) && (offset == 1)) { + int32_t dy_col_idx = matrix_row * (offset + matrix_row - 1 + offset) / 2 + matrix_col; + grad_val = row_dy[dy_col_idx] * static_cast(2); + } + } + int32_t in_grad_offset = row * in_grad_stride + in_col_id; + in_grad[in_grad_offset] = grad_val; + } + } +} + +template +void ConcatFeatures(user_op::KernelComputeContext* ctx, int64_t dst_rows, int64_t dst_cols, + void* dst_ptr) { + const int64_t feature_input_size = ctx->input_size("features"); + auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); + DimVector dst_shape = {dst_rows, dst_cols}; + int64_t out_col_offset = 0; + for (int64_t i = 0; i < feature_input_size; ++i) { + const user_op::Tensor* feature = ctx->Tensor4ArgNameAndIndex("features", i); + const int64_t feature_rows = feature->shape_view().At(0); + const int64_t feature_cols = feature->shape_view().Count(1); + DimVector dst_pos_vec = {0, out_col_offset}; + DimVector src_shape = {feature_rows, feature_cols}; + DimVector src_pos_vec = {0, 0}; + DimVector extent_vec = {feature_rows, feature_cols}; + primitive->Launch(ctx->stream(), feature->data_type(), 2, dst_ptr, dst_shape.data(), + dst_pos_vec.data(), feature->dptr(), src_shape.data(), src_pos_vec.data(), + extent_vec.data()); + out_col_offset += feature_cols; + } + int64_t pad_dim = dst_cols - out_col_offset; + if (pad_dim > 0) { + char* out_ptr = reinterpret_cast(dst_ptr) + out_col_offset * sizeof(T); + OF_CUDA_CHECK(hipMemset2DAsync(out_ptr, dst_cols * sizeof(T), 0, pad_dim * sizeof(T), dst_rows, + ctx->stream()->As()->cuda_stream())); + } +} + +template +void GatherConcatKernel(ep::Stream* stream, int32_t elem_cnt, int32_t out_dim, + int32_t valid_out_dim, int32_t features_concated_dim, + int32_t concated_padded_dim, int32_t output_concat_end_dim, + bool self_interaction, const T* matmul_out, const T* output_concat_ptr, + int32_t* gather_indices_ptr, T* out_ptr) { + hipStream_t cuda_stream = stream->As()->cuda_stream(); + const int32_t gen_indices_elem_cnt = features_concated_dim * features_concated_dim; + int32_t offset = self_interaction ? 
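+  // offset == 1 keeps the diagonal of the interaction matrix (self interaction), offset == 0
+  // keeps only the strict lower triangle. GenerateGatherIndicesGpu stores the flat source index
+  // of entry (row, col), col < row + offset, at packed slot row * (row - 1 + 2 * offset) / 2 + col;
+  // e.g. with offset == 0, row 3 fills packed slots 3, 4 and 5. A minimal host-side sketch of the
+  // same packing, for illustration only (the helper name and out-parameter are assumptions, not
+  // used anywhere in this file):
+  //   int32_t ReferencePackedIndices(int32_t rows, int32_t in_cols, bool self, int32_t* packed) {
+  //     const int32_t diag = self ? 1 : 0;
+  //     int32_t n = 0;
+  //     for (int32_t row = 0; row < rows; ++row) {
+  //       for (int32_t col = 0; col < row + diag; ++col) { packed[n++] = row * in_cols + col; }
+  //     }
+  //     return n;  // equals the interaction_dim used when sizing the gather_indices buffer
+  //   }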
1 : 0; + hipLaunchKernelGGL(GenerateGatherIndicesGpu, BlocksNum4ThreadsNum(gen_indices_elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, gen_indices_elem_cnt, features_concated_dim, + concated_padded_dim, offset, gather_indices_ptr); + + int32_t matmul_stride = concated_padded_dim * concated_padded_dim; + hipLaunchKernelGGL(GatherConcatGpu, BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream, + elem_cnt, out_dim, valid_out_dim, matmul_stride, output_concat_end_dim, gather_indices_ptr, + matmul_out, output_concat_ptr, out_ptr); +} + +template +void ScatterSplitAddTranspose(ep::Stream* stream, int32_t batch_size, int32_t out_dim, + int32_t concated_padded_dim, int32_t features_concated_dim, + int32_t output_concat_end_dim, const bool self_interaction, + const T* dy, T* output_concat_grad, T* matmul_out_grad_ptr) { + int32_t stride_dim = output_concat_end_dim + concated_padded_dim * concated_padded_dim; + int32_t matmul_stride = concated_padded_dim * concated_padded_dim; + const int32_t elem_cnt = batch_size * stride_dim; + int32_t offset = self_interaction ? 1 : 0; + ScatterSplitAddTransposeGpu<<As()->cuda_stream()>>>( + elem_cnt, stride_dim, out_dim, matmul_stride, concated_padded_dim, features_concated_dim, + output_concat_end_dim, offset, dy, output_concat_grad, matmul_out_grad_ptr); +} + +template +void ConcatFeaturesGrad(user_op::KernelComputeContext* ctx, const int64_t batch_size, + const int64_t concated_padded_dim, const int64_t vector_size, + const T* concated_features_grad) { + auto primitive = ep::primitive::NewPrimitive(DeviceType::kCUDA, 2); + DimVector src_shape = {batch_size, concated_padded_dim * vector_size}; + int64_t in_col_offset = 0; + for (int64_t i = 0; i < ctx->output_size("features_grad"); ++i) { + user_op::Tensor* feature_grad = ctx->Tensor4ArgNameAndIndex("features_grad", i); + const int64_t feature_grad_rows = feature_grad->shape_view().At(0); + const int64_t feature_grad_cols = feature_grad->shape_view().Count(1); + DimVector dst_shape = {feature_grad_rows, feature_grad_cols}; + DimVector dst_pos_vec = {0, 0}; + DimVector src_pos_vec = {0, in_col_offset}; + DimVector extent_vec = {feature_grad_rows, feature_grad_cols}; + in_col_offset += feature_grad_cols; + primitive->Launch(ctx->stream(), feature_grad->data_type(), 2, feature_grad->mut_dptr(), + dst_shape.data(), dst_pos_vec.data(), concated_features_grad, + src_shape.data(), src_pos_vec.data(), extent_vec.data()); + } +} + +template +struct DefaultComputeType { + using type = T; +}; + +template<> +struct DefaultComputeType { + using type = float; +}; + +template +struct alignas(sizeof(T) * pack_size) Pack { + T elem[pack_size]; +}; + +int64_t GetPaddedDim(int64_t dim) { + const int64_t align_dim = 16; + const int64_t padded_dim = (dim + align_dim - 1) / align_dim * align_dim; + return padded_dim; +} + +template +struct DotFwdParam { + const T* in[max_in]; + int32_t in_feature_dim[max_in]; + int32_t dim_start_offset[max_in]; + int32_t features_dim; + const T* output_concat; + int32_t output_concat_size; + T* out; + int32_t num_in; +}; + +constexpr int kUnrollDim = 2; +template +__global__ void DotFeatureInteractionWmmaImpl( + int m_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, int vector_num_pack, + int padded_vector_num_pack, int out_num_cols, int out_num_cols_num_pack, int in_shared_mem_cols, + int in_shared_mem_cols_num_pack, int acc_shared_mem_cols, int acc_shared_mem_cols_num_pack, + int offset, int output_padding, DotFwdParam param) { + asm volatile("s_trap 
0;"); +} + +template +struct KTileDim { + static const int val = 16; +}; + +template<> +struct KTileDim { + static const int val = 8; +}; + +template +struct DotFeatureInteractionKernel { + static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, + int vector_size, int out_num_cols, bool self_interaction, int output_padding, + const DotFwdParam& param) { + const int block_size = 128; + const int block_dim_x = 32; + const int block_dim_y = block_size / block_dim_x; + const int num_blocks = batch_size; + const int mn_tile_dim = 16; + const int k_tile_dim = KTileDim::val; + const int64_t padded_vector_size = GetPaddedDim(vector_size); + const int m_num_tiles = concated_padded_dim / mn_tile_dim; + const int k_num_tiles = padded_vector_size / k_tile_dim; + const int skew_in = 8; + const int skew_acc = 8; + const int in_shared_mem_num_cols = padded_vector_size + skew_in; + const int acc_shared_mem_num_cols = concated_padded_dim + skew_acc; + const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); + using ComputeType = typename DefaultComputeType::type; + const size_t acc_shared_mem_bytes = + concated_padded_dim * acc_shared_mem_num_cols * sizeof(ComputeType); + const size_t total_shared_mem_bytes = in_shared_mem_bytes + acc_shared_mem_bytes; + const int32_t offset = self_interaction ? 1 : 0; + const int out_num_cols_num_pack = out_num_cols / pack_size; + const int vector_num_pack = vector_size / pack_size; + const int padded_vector_num_pack = padded_vector_size / pack_size; + const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; + const int acc_shared_mem_cols_num_pack = acc_shared_mem_num_cols / pack_size; + int max_active_blocks; + OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + DotFeatureInteractionWmmaImpl, + block_size, total_shared_mem_bytes)); + if (max_active_blocks <= 0) { return false; } + hipStream_t cuda_stream = stream->As()->cuda_stream(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(DotFeatureInteractionWmmaImpl), num_blocks, dim3(block_dim_x, block_dim_y), total_shared_mem_bytes, cuda_stream, + m_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, + padded_vector_num_pack, out_num_cols, out_num_cols_num_pack, in_shared_mem_num_cols, + in_shared_mem_cols_num_pack, acc_shared_mem_num_cols, acc_shared_mem_cols_num_pack, + offset, output_padding, param); + return true; + } +}; + +template +struct DotBwdParam { + const T* out_grad; + const T* in[max_in]; + T* in_grad[max_in]; + T* output_concat_grad; + int32_t output_concat_size; + int32_t in_feature_dim[max_in]; + int32_t dim_start_offset[max_in]; + int32_t features_dim; + int32_t num_in; +}; + +template +__global__ void DotFeatureInteractionBackwardWmmaImpl( + int m_num_tiles, int n_num_tiles, int k_num_tiles, int64_t batch_size, int padded_num_rows, + int vector_num_pack, int padded_vector_num_pack, int out_num_cols, int in_shared_mem_cols, + int in_shared_mem_cols_num_pack, int matrix_out_grad_shared_mem_cols, int offset, + DotBwdParam param) { + asm volatile("s_trap 0;"); +} + +template +struct DotFeatureInteractionBackwardKernel { + static bool Launch(ep::Stream* stream, int64_t batch_size, int concated_padded_dim, + int vector_size, int out_num_cols, bool self_interaction, + const DotBwdParam& param) { + const int block_size = 256; + const int block_dim_x = 32; + const int block_dim_y = block_size / block_dim_x; + const int num_blocks = batch_size; + const int mn_tile_dim = 16; + const int 
k_tile_dim = KTileDim::val; + const int64_t padded_vector_size = GetPaddedDim(vector_size); + const int m_num_tiles = concated_padded_dim / mn_tile_dim; + const int k_num_tiles = concated_padded_dim / k_tile_dim; + const int n_num_tiles = padded_vector_size / mn_tile_dim; + const int skew_in = 8; + const int in_shared_mem_num_cols = padded_vector_size + skew_in; + const int matrix_out_grad_shared_mem_cols = concated_padded_dim + skew_in; + const size_t in_shared_mem_bytes = concated_padded_dim * in_shared_mem_num_cols * sizeof(T); + const size_t matrix_out_grad_shared_mem_bytes = + concated_padded_dim * matrix_out_grad_shared_mem_cols * sizeof(T); + using ComputeType = typename DefaultComputeType::type; + const size_t in_grad_shared_mem_bytes = + concated_padded_dim * in_shared_mem_num_cols * sizeof(ComputeType); + const size_t total_shared_mem_bytes = + in_shared_mem_bytes + matrix_out_grad_shared_mem_bytes + in_grad_shared_mem_bytes; + const int32_t offset = self_interaction ? 1 : 0; + const int vector_num_pack = vector_size / pack_size; + const int padded_vector_num_pack = padded_vector_size / pack_size; + const int in_shared_mem_cols_num_pack = in_shared_mem_num_cols / pack_size; + int max_active_blocks; + OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + DotFeatureInteractionBackwardWmmaImpl, + block_size, total_shared_mem_bytes)); + if (max_active_blocks <= 0) { return false; } + hipStream_t cuda_stream = stream->As()->cuda_stream(); + DotFeatureInteractionBackwardWmmaImpl + <<>>( + m_num_tiles, n_num_tiles, k_num_tiles, batch_size, concated_padded_dim, vector_num_pack, + padded_vector_num_pack, out_num_cols, in_shared_mem_num_cols, + in_shared_mem_cols_num_pack, matrix_out_grad_shared_mem_cols, offset, param); + + return true; + } +}; + +template +bool DispatchFeatureInteractionDotPackSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t batch_size = out->shape_view().At(0); + const int64_t out_num_cols = out->shape_view().At(1); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); + DotFwdParam param; + param.num_in = input_size; + param.out = out->mut_dptr(); + int64_t features_concated_dim = 0; + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + param.dim_start_offset[i] = features_concated_dim; + features_concated_dim += param.in_feature_dim[i]; + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + param.features_dim = features_concated_dim; + if (ctx->has_input("output_concat", 0)) { + const user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); + param.output_concat = output_concat->dptr(); + param.output_concat_size = output_concat->shape_view().At(1); + } else { + param.output_concat = nullptr; + param.output_concat_size = 0; + } + const bool self_interaction = ctx->Attr("self_interaction"); + const int32_t output_padding = ctx->Attr("output_padding"); + if (vector_size % 4 == 0 && out_num_cols % 4 == 0) { + return DotFeatureInteractionKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + output_padding, param); + } else if (vector_size % 2 == 0 && out_num_cols % 2 == 0) { + return 
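+    // Pack-size dispatch: the branches pick the widest vectorized access that divides both
+    // vector_size and out_num_cols (4, then 2, then scalar); only the pack_size template
+    // argument differs between them.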
DotFeatureInteractionKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + output_padding, param); + } else { + return DotFeatureInteractionKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + output_padding, param); + } +} + +template +bool DispatchFeatureInteractionDotBackwardPackSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t out_num_cols = dy->shape_view().At(1); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); + DotBwdParam param; + param.num_in = input_size; + param.out_grad = dy->dptr(); + int64_t features_concated_dim = 0; + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + param.dim_start_offset[i] = features_concated_dim; + features_concated_dim += param.in_feature_dim[i]; + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + param.features_dim = features_concated_dim; + if (ctx->has_output("output_concat_grad", 0)) { + user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); + param.output_concat_grad = output_concat_grad->mut_dptr(); + param.output_concat_size = output_concat_grad->shape_view().At(1); + } else { + param.output_concat_grad = nullptr; + param.output_concat_size = 0; + } + const bool self_interaction = ctx->Attr("self_interaction"); + if (vector_size % 4 == 0) { + return DotFeatureInteractionBackwardKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + param); + } else if (vector_size % 2 == 0) { + return DotFeatureInteractionBackwardKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + param); + } else { + return DotFeatureInteractionBackwardKernel::Launch( + ctx->stream(), batch_size, concated_padded_dim, vector_size, out_num_cols, self_interaction, + param); + } +} + +template +struct Param { + const T* in[max_in]; + int32_t in_feature_dim[max_in]; + T* out; + int32_t num_in; +}; + +template +__global__ void FeatureInteractionSum(int64_t batch_size, int64_t vector_num_pack, + Param param) { + using ComputeType = typename DefaultComputeType::type; + Pack* dst_pack = reinterpret_cast*>(param.out); + for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; + batch_idx += gridDim.x * blockDim.y) { + Pack* batch_out = dst_pack + batch_idx * vector_num_pack; + for (int col_id = threadIdx.x; col_id < vector_num_pack; col_id += blockDim.x) { + Pack sum; + Pack square_sum; +#pragma unroll + for (int k = 0; k < pack_size; ++k) { + sum.elem[k] = static_cast(0); + square_sum.elem[k] = static_cast(0); + } + for (int i = 0; i < max_in; ++i) { + if (i >= param.num_in) { break; } + const Pack* batch_in = + reinterpret_cast*>(param.in[i]) + + batch_idx * param.in_feature_dim[i] * vector_num_pack; +#pragma unroll + for (int j = 0; j < param.in_feature_dim[i]; ++j) { + Pack val = batch_in[j * vector_num_pack + col_id]; +#pragma unroll + for (int k = 0; k < pack_size; ++k) { + const ComputeType 
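+          // Accumulate S = sum_j x_j and Q = sum_j x_j^2 per output element; the pooled
+          // interaction written below is 0.5 * (S * S - Q), which equals the sum of all
+          // pairwise products x_j * x_l with j < l, so no explicit D x D matmul is needed.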
compute_val = static_cast(val.elem[k]); + sum.elem[k] += compute_val; + square_sum.elem[k] += compute_val * compute_val; + } + } + } + Pack out; +#pragma unroll + for (int k = 0; k < pack_size; ++k) { + out.elem[k] = static_cast((sum.elem[k] * sum.elem[k] - square_sum.elem[k]) + * static_cast(0.5)); + } + batch_out[col_id] = out; + } + } +} + +template +struct GradParam { + const T* out_grad; + const T* in[max_in]; + int32_t in_feature_dim[max_in]; + T* in_grad[max_in]; + int32_t num_in; +}; + +template +__global__ void FeatureInteractionSumGrad(int64_t batch_size, int64_t vector_size, + GradParam param) { + using ComputeType = typename DefaultComputeType::type; + for (int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; batch_idx < batch_size; + batch_idx += gridDim.x * blockDim.y) { + const T* batch_out_grad = param.out_grad + batch_idx * vector_size; + for (int col_id = threadIdx.x; col_id < vector_size; col_id += blockDim.x) { + ComputeType sum = 0; + for (int i = 0; i < max_in; ++i) { + if (i >= param.num_in) { break; } + const T* batch_in = param.in[i] + batch_idx * param.in_feature_dim[i] * vector_size; + for (int j = 0; j < param.in_feature_dim[i]; ++j) { + sum += static_cast(batch_in[j * vector_size + col_id]); + } + } + for (int i = 0; i < max_in; ++i) { + if (i >= param.num_in) { break; } + const int64_t in_batch_offset = batch_idx * param.in_feature_dim[i] * vector_size; + const T* batch_in = param.in[i] + in_batch_offset; + T* batch_in_grad = param.in_grad[i] + in_batch_offset; + for (int j = 0; j < param.in_feature_dim[i]; ++j) { + const int64_t offset = j * vector_size + col_id; + batch_in_grad[offset] = + static_cast(static_cast(batch_out_grad[col_id]) + * (sum - static_cast(batch_in[offset]))); + } + } + } + } +} + +void GetBlockDims(const int64_t vector_size, int* block_dim_x, int* block_dim_y) { + const int block_size = 256; + if (vector_size < block_size) { + *block_dim_x = std::ceil(static_cast(vector_size) / 8) * 8; + *block_dim_y = (block_size + *block_dim_x - 1) / *block_dim_x; + } else { + *block_dim_x = block_size; + *block_dim_y = 1; + } +} + +int GetNumBlocks(const int64_t num_instances, const int64_t instance_per_block) { + int max_blocks = (num_instances + instance_per_block - 1) / instance_per_block; + return std::min(max_blocks, kCudaMaxBlocksNum); +} + +template +void DispatchFeatureInteractionSumPackSize(ep::Stream* stream, const int64_t batch_size, + const int64_t vector_size, + const Param& param) { + int block_dim_x; + int block_dim_y; + const int pack_size = (vector_size % 2 == 0) ? 
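+  // Sum pooling only uses 2-element Pack accesses when vector_size is even; GetBlockDims then
+  // rounds the x dimension up to a multiple of 8 and fills the rest of the 256-thread block with
+  // batch rows, e.g. vector_num_pack == 16 gives a 16 x 16 block that covers 16 batch rows at a
+  // time.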
2 : 1; + const int64_t vector_num_pack = vector_size / pack_size; + GetBlockDims(vector_num_pack, &block_dim_x, &block_dim_y); + const int num_blocks = GetNumBlocks(batch_size, block_dim_y); + dim3 block_dims = dim3(block_dim_x, block_dim_y); + hipStream_t cuda_stream = stream->As()->cuda_stream(); + if (pack_size == 2) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); + } else { + hipLaunchKernelGGL(HIP_KERNEL_NAME(FeatureInteractionSum), num_blocks, block_dims, 0, cuda_stream, batch_size, vector_num_pack, param); + } +} + +template +void DispatchFeatureInteractionSumInputSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t batch_size = out->shape_view().At(0); + const int64_t vector_size = out->shape_view().At(1); + Param param; + param.num_in = input_size; + param.out = out->mut_dptr(); + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + } + DispatchFeatureInteractionSumPackSize(ctx->stream(), batch_size, vector_size, param); +} + +template +void DispatchFeatureInteractionSumGradInputSize(user_op::KernelComputeContext* ctx, + const int32_t input_size) { + CHECK_LE(input_size, max_in) << input_size; + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const int64_t batch_size = dy->shape_view().At(0); + const int64_t vector_size = dy->shape_view().At(1); + int block_dim_x; + int block_dim_y; + GetBlockDims(vector_size, &block_dim_x, &block_dim_y); + const int num_blocks = GetNumBlocks(batch_size, block_dim_y); + dim3 block_dims = dim3(block_dim_x, block_dim_y); + GradParam param; + param.num_in = input_size; + param.out_grad = dy->dptr(); + for (int i = 0; i < input_size; ++i) { + param.in[i] = ctx->Tensor4ArgNameAndIndex("features", i)->dptr(); + param.in_grad[i] = ctx->Tensor4ArgNameAndIndex("features_grad", i)->mut_dptr(); + param.in_feature_dim[i] = ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); + } + FeatureInteractionSumGrad + <<stream()->As()->cuda_stream()>>>( + batch_size, vector_size, param); +} + +} // namespace + +template +class FusedDotFeatureInteractionPoolingSumKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionPoolingSumKernel() = default; + ~FusedDotFeatureInteractionPoolingSumKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } else if (input_size == 2) { + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } else if (input_size <= 8) { + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; + DispatchFeatureInteractionSumInputSize(ctx, input_size); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "sum")); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_KERNEL(half) + +template +bool TryLaunchTensorCoreDotKernel(user_op::KernelComputeContext* ctx) { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } else if (input_size == 2) { + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } else if (input_size <= 8) { + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; + return DispatchFeatureInteractionDotPackSize(ctx, input_size); + } +} + +template +bool TryLaunchTensorCoreDotBackwardKernel(user_op::KernelComputeContext* ctx) { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } else if (input_size == 2) { + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } else if (input_size <= 8) { + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. "; + return DispatchFeatureInteractionDotBackwardPackSize(ctx, input_size); + } +} +template +class FusedDotFeatureInteractionKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionKernel() = default; + ~FusedDotFeatureInteractionKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const DataType data_type = out->data_type(); + CHECK_LT(out->shape_view().elem_cnt(), GetMaxVal()); + auto* cuda_stream = ctx->stream()->As(); + // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) + // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { + // bool success = TryLaunchTensorCoreDotKernel(ctx); + // if (success == true) { return; } + // } + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t batch_size = out->shape_view().At(0); + int64_t features_concated_dim = 0; + for (int64_t i = 0; i < ctx->input_size("features"); ++i) { + features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features", i)->shape().At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features", 0)->shape().At(2); + const int64_t out_dim = out->shape_view().At(1); + const int32_t output_padding = ctx->Attr("output_padding"); + const int64_t valid_out_dim = out_dim - output_padding; + const bool self_interaction = ctx->Attr("self_interaction"); + + T* matmul_out = reinterpret_cast(tmp_buffer->mut_dptr()); + size_t matmul_out_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * 
sizeof(T)); + const int64_t interaction_dim = self_interaction + ? features_concated_dim * (features_concated_dim + 1) / 2 + : features_concated_dim * (features_concated_dim - 1) / 2; + int32_t* gather_indices_ptr = + reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size); + size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); + T* padded_concated_features_ptr = + reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_size + gather_indices_size); + size_t padded_concated_features_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + CHECK_GE(tmp_buffer->shape_view().elem_cnt(), + matmul_out_size + gather_indices_size + padded_concated_features_size); + ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, + padded_concated_features_ptr); + auto batch_matmul = ep::primitive::NewPrimitive( + ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, + ep::primitive::BlasTransposeType::T); + batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, concated_padded_dim, + vector_size, 1.0, padded_concated_features_ptr, + padded_concated_features_ptr, 0.0, matmul_out); + + int64_t output_concat_end_dim = 0; + const T* output_concat_ptr = nullptr; + if (ctx->has_input("output_concat", 0)) { + user_op::Tensor* output_concat = ctx->Tensor4ArgNameAndIndex("output_concat", 0); + output_concat_end_dim = output_concat->shape_view().At(1); + output_concat_ptr = output_concat->dptr(); + } + CHECK_EQ(valid_out_dim, output_concat_end_dim + interaction_dim); + GatherConcatKernel(ctx->stream(), out->shape_view().elem_cnt(), out_dim, valid_out_dim, + features_concated_dim, concated_padded_dim, output_concat_end_dim, + self_interaction, matmul_out, output_concat_ptr, gather_indices_ptr, + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +user_op::InferTmpSizeFn GenFusedDotFeatureInteractionInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + const Shape& first_feature_shape = ctx->InputShape("features", 0); + const int64_t batch_size = first_feature_shape.At(0); + const int64_t vector_size = first_feature_shape.At(2); + int64_t features_concated_dim = 0; + for (int32_t i = 0; i < ctx->input_size("features"); ++i) { + features_concated_dim += ctx->InputShape("features", i).At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + size_t matmul_out_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); + const bool self_interaction = ctx->Attr("self_interaction"); + const int64_t interaction_dim = self_interaction + ? 
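+    // interaction_dim counts the gathered lower-triangle entries: d * (d + 1) / 2 with the
+    // diagonal (self interaction) or d * (d - 1) / 2 without it, where d is the un-padded
+    // features_concated_dim; e.g. d == 26 without self interaction yields 26 * 25 / 2 == 325.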
features_concated_dim * (features_concated_dim + 1) / 2 + : features_concated_dim * (features_concated_dim - 1) / 2; + size_t gather_indices_size = GetCudaAlignedSize(interaction_dim * sizeof(int32_t)); + size_t padded_concated_features_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + return matmul_out_size + gather_indices_size + padded_concated_features_size; + }; +} + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "none")) \ + .SetInferTmpSizeFn(GenFusedDotFeatureInteractionInferTmpSizeFn()); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_KERNEL(half) + +template +class FusedDotFeatureInteractionGradKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionGradKernel() = default; + ~FusedDotFeatureInteractionGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const DataType data_type = dy->data_type(); + auto* cuda_stream = ctx->stream()->As(); + // if ((cuda_stream->device_properties().major >= 7 && data_type == DataType::kFloat16) + // || (cuda_stream->device_properties().major >= 8 && data_type == DataType::kFloat)) { + // bool success = TryLaunchTensorCoreDotBackwardKernel(ctx); + // if (success == true) { return; } + // } + const int64_t batch_size = dy->shape_view().At(0); + int64_t features_concated_dim = 0; + for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { + features_concated_dim += ctx->TensorDesc4ArgNameAndIndex("features_grad", i)->shape().At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + const int64_t vector_size = ctx->TensorDesc4ArgNameAndIndex("features_grad", 0)->shape().At(2); + const int64_t out_dim = dy->shape_view().At(1); + const bool self_interaction = ctx->Attr("self_interaction"); + T* matmul_out_grad_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); + size_t matmul_out_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); + T* padded_concated_features_grad_ptr = + reinterpret_cast(tmp_buffer->mut_dptr() + matmul_out_grad_size); + size_t padded_concated_features_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + T* padded_concated_features_ptr = reinterpret_cast( + tmp_buffer->mut_dptr() + matmul_out_grad_size + padded_concated_features_grad_size); + size_t padded_concated_features_size = padded_concated_features_grad_size; + CHECK_LE( + matmul_out_grad_size + padded_concated_features_grad_size + padded_concated_features_size, + tmp_buffer->shape_view().elem_cnt()); + ConcatFeatures(ctx, batch_size, concated_padded_dim * vector_size, + padded_concated_features_ptr); + + T* output_concat_grad_ptr = nullptr; + int64_t output_concat_end_dim = 0; + if (ctx->has_output("output_concat_grad", 0)) { + user_op::Tensor* output_concat_grad = ctx->Tensor4ArgNameAndIndex("output_concat_grad", 0); + output_concat_grad_ptr = output_concat_grad->mut_dptr(); + output_concat_end_dim = 
output_concat_grad->shape_view().At(1); + } + ScatterSplitAddTranspose(ctx->stream(), batch_size, out_dim, concated_padded_dim, + features_concated_dim, output_concat_end_dim, self_interaction, + dy->dptr(), output_concat_grad_ptr, matmul_out_grad_ptr); + + auto batch_matmul = ep::primitive::NewPrimitive( + ctx->device_type(), data_type, ep::primitive::BlasTransposeType::N, + ep::primitive::BlasTransposeType::N); + batch_matmul->Launch(ctx->stream(), batch_size, concated_padded_dim, vector_size, + concated_padded_dim, 1.0, matmul_out_grad_ptr, + padded_concated_features_ptr, 0.0, padded_concated_features_grad_ptr); + + ConcatFeaturesGrad(ctx, batch_size, concated_padded_dim, vector_size, + padded_concated_features_grad_ptr); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +user_op::InferTmpSizeFn GenFusedDotFeatureInteractionGradInferTmpSizeFn() { + return [](user_op::InferContext* ctx) { + int64_t features_concated_dim = 0; + for (int32_t i = 0; i < ctx->output_size("features_grad"); ++i) { + features_concated_dim += ctx->InputShape("features_grad", i).At(1); + } + const int64_t concated_padded_dim = GetPaddedDim(features_concated_dim); + const int64_t batch_size = ctx->InputShape("features_grad", 0).At(0); + const int64_t vector_size = ctx->InputShape("features_grad", 0).At(2); + size_t matmul_out_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * concated_padded_dim * sizeof(T)); + size_t padded_concated_features_grad_size = + GetCudaAlignedSize(batch_size * concated_padded_dim * vector_size * sizeof(T)); + size_t padded_concated_features_size = padded_concated_features_grad_size; + return matmul_out_grad_size + padded_concated_features_grad_size + + padded_concated_features_size; + }; +} + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "none")) \ + .SetInferTmpSizeFn(GenFusedDotFeatureInteractionGradInferTmpSizeFn()); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_GRAD_KERNEL(half) + +template +class FusedDotFeatureInteractionPoolingSumGradKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedDotFeatureInteractionPoolingSumGradKernel() = default; + ~FusedDotFeatureInteractionPoolingSumGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const int input_size = ctx->input_size("features"); + if (input_size == 1) { + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } else if (input_size == 2) { + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } else if (input_size <= 8) { + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } else { + CHECK_LE(input_size, 128) << "input_size must not greater than 128. 
"; + DispatchFeatureInteractionSumGradInputSize(ctx, input_size); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_dot_feature_interaction_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobAttr("pooling") == "sum")); + +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(float) +REGISTER_FUSED_DOT_FEATURE_INTERACTION_POOLING_SUM_GRAD_KERNEL(half) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp b/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp index d910b5d..3371b04 100644 --- a/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_gru_cell_kernel.hip.cpp @@ -1,472 +1,472 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/rocm/cuda_device.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" - -// NOTE(Liang Depeng): The implementation of fused_gru_cell is modified from -// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu - -namespace oneflow { - -namespace { - -template -struct AccumulateType {}; -template<> -struct AccumulateType { - using type = float; -}; -template<> -struct AccumulateType { - using type = double; -}; - -template -using acc_type = typename AccumulateType::type; - -#define H2F(input) static_cast(input) -#define F2H(input) static_cast(input) - -template -__device__ __forceinline__ T sigmoid(T in) { - T one = static_cast(1.0); - return one / (one + ::exp(-in)); -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ void gru_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, - const T* input_gates_ptr, const T* hidden_gates_ptr, - const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, - T* hy_ptr, T* workspace_ptr) { - bool has_bias = input_bias_ptr != nullptr; - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; - - T ir = input_gates_ptr[offset + 0 * hidden_size]; - T ii = input_gates_ptr[offset + 1 * hidden_size]; - T in = input_gates_ptr[offset + 2 * hidden_size]; - T 
hr = hidden_gates_ptr[offset + 0 * hidden_size]; - T hi = hidden_gates_ptr[offset + 1 * hidden_size]; - T hn = hidden_gates_ptr[offset + 2 * hidden_size]; - - T hx = hx_ptr[linearIndex]; - T* hy = &(hy_ptr[linearIndex]); - - T b1r, b1i, b1n, b2r, b2i, b2n; - - if (has_bias) { - b1r = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b1i = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b1n = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - - b2r = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b2i = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b2n = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - } else { - b1r = F2H(0.0); - b1i = F2H(0.0); - b1n = F2H(0.0); - b2r = F2H(0.0); - b2i = F2H(0.0); - b2n = F2H(0.0); - } - - offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; - ACC_T rg, ig, ng; - rg = sigmoid(H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r)); - ig = sigmoid(H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i)); - - ng = H2F(in) + H2F(b1n) + rg * (H2F(hn) + H2F(b2n)); - ng = ::tanh(ng); - *hy = F2H(ng + ig * (H2F(hx) - ng)); - - // SAVE FOR BACKWARDS - workspace_ptr[offset + 0 * hidden_size] = F2H(rg); - workspace_ptr[offset + 1 * hidden_size] = F2H(ig); - workspace_ptr[offset + 2 * hidden_size] = F2H(ng); - workspace_ptr[offset + 3 * hidden_size] = hx; - workspace_ptr[offset + 4 * hidden_size] = F2H(H2F(hn) + H2F(b2n)); - } -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ - void gru_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, - const T* workspace_ptr, T* grad_input_gates_ptr, - T* grad_hidden_gates_ptr, T* grad_hx_ptr) { - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; - - T rg = workspace_ptr[offset + 0 * hidden_size]; - T ig = workspace_ptr[offset + 1 * hidden_size]; - T ng = workspace_ptr[offset + 2 * hidden_size]; - T hx = workspace_ptr[offset + 3 * hidden_size]; - T hn = workspace_ptr[offset + 4 * hidden_size]; - - T go = grad_hy_ptr[linearIndex]; - - offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; - - ACC_T gig = H2F(go) * (H2F(hx) - H2F(ng)) * (1 - H2F(ig)) * H2F(ig); - ACC_T ghx = H2F(go) * H2F(ig); - ACC_T gin = H2F(go) * (1 - H2F(ig)) * (1 - H2F(ng) * H2F(ng)); - ACC_T ghn = gin * H2F(rg); - ACC_T grg = gin * H2F(hn) * (1 - H2F(rg)) * H2F(rg); - - grad_input_gates_ptr[offset + 0 * hidden_size] = F2H(grg); - grad_input_gates_ptr[offset + 1 * hidden_size] = F2H(gig); - grad_input_gates_ptr[offset + 2 * hidden_size] = F2H(gin); - - grad_hidden_gates_ptr[offset + 0 * hidden_size] = F2H(grg); - grad_hidden_gates_ptr[offset + 1 * hidden_size] = F2H(gig); - grad_hidden_gates_ptr[offset + 2 * hidden_size] = F2H(ghn); - if (grad_hx_ptr != nullptr) { grad_hx_ptr[linearIndex] = F2H(ghx); } - } -} - -template -struct FusedGruCellGradFunctor final { - void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* grad_hy_ptr, const T* workspace_ptr, - T* grad_input_gates_ptr, T* grad_hidden_gates_ptr, T* grad_hx_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, - static_cast(hx_numel), static_cast(hidden_size), - grad_hy_ptr, 
workspace_ptr, grad_input_gates_ptr, grad_hidden_gates_ptr, - grad_hx_ptr); - } else { - RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, hx_numel, - hidden_size, grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, - grad_hidden_gates_ptr, grad_hx_ptr); - } - } -}; - -template<> -void FusedGruCellGradFunctor::operator()( - ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* grad_hy_ptr, const float16* workspace_ptr, - float16* grad_input_gates_ptr, float16* grad_hidden_gates_ptr, float16* grad_hx_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL( - (gru_cell_backward), stream, hx_numel, static_cast(hx_numel), - static_cast(hidden_size), reinterpret_cast(grad_hy_ptr), - reinterpret_cast(workspace_ptr), reinterpret_cast(grad_input_gates_ptr), - reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); - } else { - RUN_CUDA_KERNEL( - (gru_cell_backward), stream, hx_numel, hx_numel, hidden_size, - reinterpret_cast(grad_hy_ptr), reinterpret_cast(workspace_ptr), - reinterpret_cast(grad_input_gates_ptr), - reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); - } -} - -template -struct FusedGruCellFunctor final { - void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, - const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, - T* workspace_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, - static_cast(hx_numel), static_cast(hidden_size), - input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, - hy_ptr, workspace_ptr); - } else { - RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, - hidden_size, input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, - hidden_bias_ptr, hy_ptr, workspace_ptr); - } - } -}; - -template<> -void FusedGruCellFunctor::operator()( - ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, - const float16* hx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, - float16* hy_ptr, float16* workspace_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL( - (gru_cell_forward), stream, hx_numel, static_cast(hx_numel), - static_cast(hidden_size), reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), reinterpret_cast(hx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(workspace_ptr)); - } else { - RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, - hidden_size, reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), - reinterpret_cast(hx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(workspace_ptr)); - } -} - -} // namespace - -template -class GpuFusedGruCellKernel final : public user_op::OpKernel { - public: - GpuFusedGruCellKernel() = default; - ~GpuFusedGruCellKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); - const user_op::Tensor* hidden_gates = 
ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); - const user_op::Tensor* hx = ctx->Tensor4ArgNameAndIndex("hx", 0); - user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); - user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - - const T* input_bias_ptr = nullptr; - const T* hidden_bias_ptr = nullptr; - if (ctx->has_input("input_bias", 0)) { - CHECK(ctx->has_input("hidden_bias", 0)); - input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); - hidden_bias_ptr = ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); - } - const T* input_gates_ptr = input_gates->dptr(); - const T* hidden_gates_ptr = hidden_gates->dptr(); - const T* hx_ptr = hx->dptr(); - - T* hy_ptr = hy->mut_dptr(); - T* workspace_ptr = workspace->mut_dptr(); - const int64_t hx_numel = hx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = hx->shape_view().At(hx->shape_view().NumAxes() - 1); - FusedGruCellFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, input_gates_ptr, - hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, hy_ptr, - workspace_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_GRU_CELL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_gru_cell") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("hx", 0) == GetDataType::value) \ - && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ - && (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) - -REGISTER_FUSED_GRU_CELL_KERNEL(float); -REGISTER_FUSED_GRU_CELL_KERNEL(float16); - -class GpuFusedGruCellGradFloatKernel final : public user_op::OpKernel { - public: - GpuFusedGruCellGradFloatKernel() = default; - ~GpuFusedGruCellGradFloatKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); - user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); - - const float* grad_hy_ptr = grad_hy->dptr(); - const float* workspace_ptr = workspace->dptr(); - - float* grad_input_gates_ptr = grad_input_gates->mut_dptr(); - float* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); - - float* grad_hx_ptr = nullptr; - if (ctx->has_output("grad_hx", 0)) { - user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); - grad_hx_ptr = grad_hx->mut_dptr(); - } - - const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); - FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, - grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, - grad_hidden_gates_ptr, grad_hx_ptr); - - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - float* grad_input_bias_ptr = - ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0)->mut_dptr(); - std::vector axis; - axis.push_back(0); - const Shape& reduced_shape = - CreateReducedShape(grad_input_gates->shape_view(), {axis.begin(), axis.end()}); - user_op::Tensor* tmp_buffer = 
ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, grad_input_bias_ptr), - XpuVarNdarray(grad_input_gates->shape_view(), - grad_input_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); - - float* grad_hidden_bias_ptr = - ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0)->mut_dptr(); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, grad_hidden_bias_ptr), - XpuVarNdarray(grad_hidden_gates->shape_view(), - grad_hidden_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_gru_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); - tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - -class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { - public: - GpuFusedGruCellGradHalfKernel() = default; - ~GpuFusedGruCellGradHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); - user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); - - const float16* grad_hy_ptr = grad_hy->dptr(); - const float16* workspace_ptr = workspace->dptr(); - - float16* grad_input_gates_ptr = grad_input_gates->mut_dptr(); - float16* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); - - float16* grad_hx_ptr = nullptr; - if (ctx->has_output("grad_hx", 0)) { - user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); - grad_hx_ptr = grad_hx->mut_dptr(); - } - - const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); - FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, - grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, - grad_hidden_gates_ptr, grad_hx_ptr); - - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - std::vector axis; - axis.push_back(0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = grad_input_gates->shape_view(); - const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); - float* in_tmp_buffer = tmp_buffer->mut_dptr(); - const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - float* out_tmp_buffer = - reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); - const size_t out_tmp_buffer_bytes = - GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); - float* reduce_tmp_buffer = reinterpret_cast( - 
tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); - const size_t reduce_tmp_buffer_bytes = - GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape_view().elem_cnt()); - auto h2f = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat16, DataType::kFloat); - CHECK(h2f); - auto f2h = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat, DataType::kFloat16); - CHECK(f2h); - h2f->Launch(ctx->stream(), grad_input_gates->dptr(), in_tmp_buffer, - in_shape.elem_cnt()); - - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), - XpuVarNdarray(in_shape, in_tmp_buffer), - XpuVarNdarray(in_shape, reduce_tmp_buffer)); - - user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0); - f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape_view().elem_cnt()); - - h2f->Launch(ctx->stream(), grad_hidden_gates->dptr(), in_tmp_buffer, - in_shape.elem_cnt()); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), - XpuVarNdarray(in_shape, in_tmp_buffer), - XpuVarNdarray(in_shape, reduce_tmp_buffer)); - - output_tensor = ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0); - f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape_view().elem_cnt()); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_gru_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); - const Shape& out_shape = ctx->OutputTensorDesc("grad_input_bias", 0)->shape(); - tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)) - + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/ep/rocm/cuda_device.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" + +// NOTE(Liang Depeng): The implementation of fused_gru_cell is modified from +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu + +namespace oneflow { + +namespace { + +template +struct AccumulateType {}; +template<> +struct AccumulateType { + using type = float; +}; +template<> +struct AccumulateType { + using type = double; +}; + +template +using acc_type = typename AccumulateType::type; + +#define H2F(input) static_cast(input) +#define F2H(input) static_cast(input) + +template +__device__ __forceinline__ T sigmoid(T in) { + T one = static_cast(1.0); + return one / (one + ::exp(-in)); +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ void gru_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, + const T* input_gates_ptr, const T* hidden_gates_ptr, + const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, + T* hy_ptr, T* workspace_ptr) { + bool has_bias = input_bias_ptr != nullptr; + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; + + T ir = input_gates_ptr[offset + 0 * hidden_size]; + T ii = input_gates_ptr[offset + 1 * hidden_size]; + T in = input_gates_ptr[offset + 2 * hidden_size]; + T hr = hidden_gates_ptr[offset + 0 * hidden_size]; + T hi = hidden_gates_ptr[offset + 1 * hidden_size]; + T hn = hidden_gates_ptr[offset + 2 * hidden_size]; + + T hx = hx_ptr[linearIndex]; + T* hy = &(hy_ptr[linearIndex]); + + T b1r, b1i, b1n, b2r, b2i, b2n; + + if (has_bias) { + b1r = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b1i = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b1n = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + + b2r = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b2i = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b2n = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + } else { + b1r = F2H(0.0); + b1i = F2H(0.0); + b1n = F2H(0.0); + b2r = F2H(0.0); + b2i = F2H(0.0); + b2n = F2H(0.0); + } + + offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; + ACC_T rg, ig, ng; + rg = sigmoid(H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r)); + ig = sigmoid(H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i)); + + ng = H2F(in) + H2F(b1n) + rg * (H2F(hn) + H2F(b2n)); + ng = ::tanh(ng); + *hy = F2H(ng + ig * (H2F(hx) - ng)); + + // SAVE FOR BACKWARDS + workspace_ptr[offset + 0 * hidden_size] = F2H(rg); + workspace_ptr[offset + 1 * hidden_size] = F2H(ig); + workspace_ptr[offset + 2 * hidden_size] = F2H(ng); + workspace_ptr[offset + 3 * hidden_size] = hx; + workspace_ptr[offset + 4 * hidden_size] = F2H(H2F(hn) + H2F(b2n)); + } +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ + void 
gru_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, + const T* workspace_ptr, T* grad_input_gates_ptr, + T* grad_hidden_gates_ptr, T* grad_hx_ptr) { + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 5 * hidden_size + linearIndex % hidden_size; + + T rg = workspace_ptr[offset + 0 * hidden_size]; + T ig = workspace_ptr[offset + 1 * hidden_size]; + T ng = workspace_ptr[offset + 2 * hidden_size]; + T hx = workspace_ptr[offset + 3 * hidden_size]; + T hn = workspace_ptr[offset + 4 * hidden_size]; + + T go = grad_hy_ptr[linearIndex]; + + offset = (linearIndex / hidden_size) * 3 * hidden_size + linearIndex % hidden_size; + + ACC_T gig = H2F(go) * (H2F(hx) - H2F(ng)) * (1 - H2F(ig)) * H2F(ig); + ACC_T ghx = H2F(go) * H2F(ig); + ACC_T gin = H2F(go) * (1 - H2F(ig)) * (1 - H2F(ng) * H2F(ng)); + ACC_T ghn = gin * H2F(rg); + ACC_T grg = gin * H2F(hn) * (1 - H2F(rg)) * H2F(rg); + + grad_input_gates_ptr[offset + 0 * hidden_size] = F2H(grg); + grad_input_gates_ptr[offset + 1 * hidden_size] = F2H(gig); + grad_input_gates_ptr[offset + 2 * hidden_size] = F2H(gin); + + grad_hidden_gates_ptr[offset + 0 * hidden_size] = F2H(grg); + grad_hidden_gates_ptr[offset + 1 * hidden_size] = F2H(gig); + grad_hidden_gates_ptr[offset + 2 * hidden_size] = F2H(ghn); + if (grad_hx_ptr != nullptr) { grad_hx_ptr[linearIndex] = F2H(ghx); } + } +} + +template +struct FusedGruCellGradFunctor final { + void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* grad_hy_ptr, const T* workspace_ptr, + T* grad_input_gates_ptr, T* grad_hidden_gates_ptr, T* grad_hx_ptr) { + using ACC_T = acc_type; + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, + static_cast(hx_numel), static_cast(hidden_size), + grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, grad_hidden_gates_ptr, + grad_hx_ptr); + } else { + RUN_CUDA_KERNEL((gru_cell_backward), stream, hx_numel, hx_numel, + hidden_size, grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, + grad_hidden_gates_ptr, grad_hx_ptr); + } + } +}; + +template<> +void FusedGruCellGradFunctor::operator()( + ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* grad_hy_ptr, const float16* workspace_ptr, + float16* grad_input_gates_ptr, float16* grad_hidden_gates_ptr, float16* grad_hx_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL( + (gru_cell_backward), stream, hx_numel, static_cast(hx_numel), + static_cast(hidden_size), reinterpret_cast(grad_hy_ptr), + reinterpret_cast(workspace_ptr), reinterpret_cast(grad_input_gates_ptr), + reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); + } else { + RUN_CUDA_KERNEL( + (gru_cell_backward), stream, hx_numel, hx_numel, hidden_size, + reinterpret_cast(grad_hy_ptr), reinterpret_cast(workspace_ptr), + reinterpret_cast(grad_input_gates_ptr), + reinterpret_cast(grad_hidden_gates_ptr), reinterpret_cast(grad_hx_ptr)); + } +} + +template +struct FusedGruCellFunctor final { + void operator()(ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, + const T* hx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, + T* workspace_ptr) { + using ACC_T = acc_type; 
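+    // Index-width dispatch: the branch below launches the 32-bit-index instantiation of
+    // gru_cell_forward when the workspace element count fits within the 32-bit limit
+    // (cheaper index arithmetic on the device) and otherwise launches the 64-bit variant.
+    // Both paths run the same kernel on the caller's stream through RUN_CUDA_KERNEL.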
+ if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, + static_cast(hx_numel), static_cast(hidden_size), + input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, + hy_ptr, workspace_ptr); + } else { + RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, + hidden_size, input_gates_ptr, hidden_gates_ptr, hx_ptr, input_bias_ptr, + hidden_bias_ptr, hy_ptr, workspace_ptr); + } + } +}; + +template<> +void FusedGruCellFunctor::operator()( + ep::Stream* stream, const int64_t hx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, + const float16* hx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, + float16* hy_ptr, float16* workspace_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL( + (gru_cell_forward), stream, hx_numel, static_cast(hx_numel), + static_cast(hidden_size), reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), reinterpret_cast(hx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(workspace_ptr)); + } else { + RUN_CUDA_KERNEL((gru_cell_forward), stream, hx_numel, hx_numel, + hidden_size, reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), + reinterpret_cast(hx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(workspace_ptr)); + } +} + +} // namespace + +template +class GpuFusedGruCellKernel final : public user_op::OpKernel { + public: + GpuFusedGruCellKernel() = default; + ~GpuFusedGruCellKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); + const user_op::Tensor* hidden_gates = ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); + const user_op::Tensor* hx = ctx->Tensor4ArgNameAndIndex("hx", 0); + user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); + user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + + const T* input_bias_ptr = nullptr; + const T* hidden_bias_ptr = nullptr; + if (ctx->has_input("input_bias", 0)) { + CHECK(ctx->has_input("hidden_bias", 0)); + input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); + hidden_bias_ptr = ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); + } + const T* input_gates_ptr = input_gates->dptr(); + const T* hidden_gates_ptr = hidden_gates->dptr(); + const T* hx_ptr = hx->dptr(); + + T* hy_ptr = hy->mut_dptr(); + T* workspace_ptr = workspace->mut_dptr(); + const int64_t hx_numel = hx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = hx->shape_view().At(hx->shape_view().NumAxes() - 1); + FusedGruCellFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, input_gates_ptr, + hidden_gates_ptr, hx_ptr, input_bias_ptr, hidden_bias_ptr, hy_ptr, + workspace_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_GRU_CELL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_gru_cell") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("hx", 0) == GetDataType::value) \ + && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ + 
&& (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) + +REGISTER_FUSED_GRU_CELL_KERNEL(float); +REGISTER_FUSED_GRU_CELL_KERNEL(float16); + +class GpuFusedGruCellGradFloatKernel final : public user_op::OpKernel { + public: + GpuFusedGruCellGradFloatKernel() = default; + ~GpuFusedGruCellGradFloatKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); + user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); + + const float* grad_hy_ptr = grad_hy->dptr(); + const float* workspace_ptr = workspace->dptr(); + + float* grad_input_gates_ptr = grad_input_gates->mut_dptr(); + float* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); + + float* grad_hx_ptr = nullptr; + if (ctx->has_output("grad_hx", 0)) { + user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); + grad_hx_ptr = grad_hx->mut_dptr(); + } + + const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); + FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, + grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, + grad_hidden_gates_ptr, grad_hx_ptr); + + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + float* grad_input_bias_ptr = + ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0)->mut_dptr(); + std::vector axis; + axis.push_back(0); + const Shape& reduced_shape = + CreateReducedShape(grad_input_gates->shape_view(), {axis.begin(), axis.end()}); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, grad_input_bias_ptr), + XpuVarNdarray(grad_input_gates->shape_view(), + grad_input_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); + + float* grad_hidden_bias_ptr = + ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0)->mut_dptr(); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, grad_hidden_bias_ptr), + XpuVarNdarray(grad_hidden_gates->shape_view(), + grad_hidden_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("fused_gru_cell_grad") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); + tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + +class GpuFusedGruCellGradHalfKernel final : public user_op::OpKernel { + public: + GpuFusedGruCellGradHalfKernel() = default; + ~GpuFusedGruCellGradHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void 
Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_input_gates = ctx->Tensor4ArgNameAndIndex("grad_input_gates", 0); + user_op::Tensor* grad_hidden_gates = ctx->Tensor4ArgNameAndIndex("grad_hidden_gates", 0); + + const float16* grad_hy_ptr = grad_hy->dptr(); + const float16* workspace_ptr = workspace->dptr(); + + float16* grad_input_gates_ptr = grad_input_gates->mut_dptr(); + float16* grad_hidden_gates_ptr = grad_hidden_gates->mut_dptr(); + + float16* grad_hx_ptr = nullptr; + if (ctx->has_output("grad_hx", 0)) { + user_op::Tensor* grad_hx = ctx->Tensor4ArgNameAndIndex("grad_hx", 0); + grad_hx_ptr = grad_hx->mut_dptr(); + } + + const int64_t hx_numel = grad_hy->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = grad_hy->shape_view().At(grad_hy->shape_view().NumAxes() - 1); + FusedGruCellGradFunctor()(ctx->stream(), hx_numel, workspace_numel, hidden_size, + grad_hy_ptr, workspace_ptr, grad_input_gates_ptr, + grad_hidden_gates_ptr, grad_hx_ptr); + + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + std::vector axis; + axis.push_back(0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const ShapeView& in_shape = grad_input_gates->shape_view(); + const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); + float* in_tmp_buffer = tmp_buffer->mut_dptr(); + const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + float* out_tmp_buffer = + reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); + const size_t out_tmp_buffer_bytes = + GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); + float* reduce_tmp_buffer = reinterpret_cast( + tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); + const size_t reduce_tmp_buffer_bytes = + GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, + tmp_buffer->shape_view().elem_cnt()); + auto h2f = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat16, DataType::kFloat); + CHECK(h2f); + auto f2h = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat, DataType::kFloat16); + CHECK(f2h); + h2f->Launch(ctx->stream(), grad_input_gates->dptr(), in_tmp_buffer, + in_shape.elem_cnt()); + + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), + XpuVarNdarray(in_shape, in_tmp_buffer), + XpuVarNdarray(in_shape, reduce_tmp_buffer)); + + user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_input_bias", 0); + f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), + output_tensor->shape_view().elem_cnt()); + + h2f->Launch(ctx->stream(), grad_hidden_gates->dptr(), in_tmp_buffer, + in_shape.elem_cnt()); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), + XpuVarNdarray(in_shape, in_tmp_buffer), + XpuVarNdarray(in_shape, reduce_tmp_buffer)); + + output_tensor = ctx->Tensor4ArgNameAndIndex("grad_hidden_bias", 0); + f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), + output_tensor->shape_view().elem_cnt()); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + 
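+// Registration for the half-precision grad kernel. Its InferTmpSizeFn below mirrors the
+// carve-up done in Compute above: two float staging buffers sized for the 3 * hidden_size
+// gate grads (one holding the cast input, one as reduction scratch) plus one float buffer
+// for the reduced bias grad, each rounded up with GetCudaAlignedSize. The gate grads are
+// cast half -> float, reduced over axis 0, then cast back to half into grad_input_bias and
+// grad_hidden_bias.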
+REGISTER_USER_KERNEL("fused_gru_cell_grad") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_input_bias", 0) && ctx->has_output("grad_hidden_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("grad_hy", 0).shape(); + const Shape& out_shape = ctx->OutputTensorDesc("grad_input_bias", 0)->shape(); + tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * 3 * sizeof(float)) + + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp b/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp index 60cbc6c..d06005c 100644 --- a/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_lstm_cell_kernel.hip.cpp @@ -1,505 +1,505 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/ndarray/xpu_var_ndarray.h" -#include "oneflow/core/kernel/kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/rocm/cuda_device.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" - -// NOTE(Liang Depeng): The implementation of fused_lstm_cell is modified from -// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu - -namespace oneflow { - -namespace { - -template -struct AccumulateType {}; -template<> -struct AccumulateType { - using type = float; -}; -template<> -struct AccumulateType { - using type = double; -}; - -template -using acc_type = typename AccumulateType::type; - -#define H2F(input) static_cast(input) -#define F2H(input) static_cast(input) - -template -__device__ __forceinline__ T sigmoid(T in) { - T one = static_cast(1.0); - return one / (one + ::exp(-in)); -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ - void lstm_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, - const T* input_gates_ptr, const T* hidden_gates_ptr, const T* cx_ptr, - const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, T* cy_ptr, - T* workspace_ptr) { - bool has_bias = input_bias_ptr != nullptr; - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; - 
- T iig = input_gates_ptr[offset + 0 * hidden_size]; - T ifg = input_gates_ptr[offset + 1 * hidden_size]; - T icg = input_gates_ptr[offset + 2 * hidden_size]; - T iog = input_gates_ptr[offset + 3 * hidden_size]; - - T hig = hidden_gates_ptr[offset + 0 * hidden_size]; - T hfg = hidden_gates_ptr[offset + 1 * hidden_size]; - T hcg = hidden_gates_ptr[offset + 2 * hidden_size]; - T hog = hidden_gates_ptr[offset + 3 * hidden_size]; - - T* wig = &(workspace_ptr[offset + 0 * hidden_size]); - T* wfg = &(workspace_ptr[offset + 1 * hidden_size]); - T* wcg = &(workspace_ptr[offset + 2 * hidden_size]); - T* wog = &(workspace_ptr[offset + 3 * hidden_size]); - - T cx = cx_ptr[linearIndex]; - - T* hy = &(hy_ptr[linearIndex]); - T* cy = &(cy_ptr[linearIndex]); - - T b1i, b1f, b1c, b1o; - T b2i, b2f, b2c, b2o; - - if (has_bias) { - b1i = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b1f = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b1c = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - b1o = input_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; - - b2i = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; - b2f = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; - b2c = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; - b2o = hidden_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; - } else { - b1i = F2H(0.0); - b1f = F2H(0.0); - b1c = F2H(0.0); - b1o = F2H(0.0); - b2i = F2H(0.0); - b2f = F2H(0.0); - b2c = F2H(0.0); - b2o = F2H(0.0); - } - - ACC_T ig, fg, cg, og; - ACC_T f_hy, f_cy; - - ig = sigmoid(H2F(iig) + H2F(hig) + H2F(b1i) + H2F(b2i)); - fg = sigmoid(H2F(ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f)); - cg = ::tanh(H2F(icg) + H2F(hcg) + H2F(b1c) + H2F(b2c)); - og = sigmoid(H2F(iog) + H2F(hog) + H2F(b1o) + H2F(b2o)); - - f_cy = (fg * H2F(cx)) + (ig * cg); - f_hy = og * ::tanh(f_cy); - - *hy = F2H(f_hy); - *cy = F2H(f_cy); - - // SAVE FOR BACKWARDS - // Also need cy and cx but can be saved easily in python - *wig = F2H(ig); - *wfg = F2H(fg); - *wcg = F2H(cg); - *wog = F2H(og); - } -} - -template -#if __CUDA_ARCH__ >= 350 -OF_LAUNCH_BOUNDS_2(512, 4) -#endif -__global__ - void lstm_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, - const T* grad_cy_ptr, const T* cx_ptr, const T* cy_ptr, - const T* workspace_ptr, T* grad_gates_ptr, T* grad_cx_ptr) { - for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; - linearIndex += gridDim.x * blockDim.x) { - IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; - - T ig = workspace_ptr[offset + 0 * hidden_size]; - T fg = workspace_ptr[offset + 1 * hidden_size]; - T cg = workspace_ptr[offset + 2 * hidden_size]; - T og = workspace_ptr[offset + 3 * hidden_size]; - - T* ih = &(grad_gates_ptr[offset + 0 * hidden_size]); - T* fh = &(grad_gates_ptr[offset + 1 * hidden_size]); - T* ch = &(grad_gates_ptr[offset + 2 * hidden_size]); - T* oh = &(grad_gates_ptr[offset + 3 * hidden_size]); - - // will return hidden grads here - T cx = cx_ptr[linearIndex]; - T cy = cy_ptr[linearIndex]; - - ACC_T go = H2F(grad_hy_ptr[linearIndex]); - ACC_T goc = H2F(grad_cy_ptr[linearIndex]); - - ACC_T gcx = ::tanh(H2F(cy)); - - ACC_T gog = go * gcx; - gcx = go * H2F(og) * (1 - gcx * gcx) + goc; - - ACC_T gig = gcx * H2F(cg); - ACC_T gfg = gcx * H2F(cx); - ACC_T gcg = gcx * H2F(ig); - - gig = gig * (1 - H2F(ig)) * H2F(ig); - gfg = gfg * (1 - H2F(fg)) * H2F(fg); - gcg = gcg * (1 - H2F(cg) * H2F(cg)); - 
gog = gog * (1 - H2F(og)) * H2F(og); - - *ih = F2H(gig); - *fh = F2H(gfg); - *ch = F2H(gcg); - *oh = F2H(gog); - - if (grad_cx_ptr != nullptr) { - gcx = gcx * H2F(fg); - T* gi = &(grad_cx_ptr[linearIndex]); - *gi = F2H(gcx); - } - } -} - -template -struct FusedLstmCellFunctor final { - void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, - const T* cx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, - T* cy_ptr, T* workspace_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, - static_cast(cx_numel), static_cast(hidden_size), - input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, hidden_bias_ptr, - hy_ptr, cy_ptr, workspace_ptr); - } else { - RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, - hidden_size, input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, - hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); - } - } -}; - -template<> -void FusedLstmCellFunctor::operator()( - ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, - const float16* cx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, - float16* hy_ptr, float16* cy_ptr, float16* workspace_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL( - (lstm_cell_forward), stream, cx_numel, static_cast(cx_numel), - static_cast(hidden_size), reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), reinterpret_cast(cx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); - } else { - RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, - hidden_size, reinterpret_cast(input_gates_ptr), - reinterpret_cast(hidden_gates_ptr), - reinterpret_cast(cx_ptr), - reinterpret_cast(input_bias_ptr), - reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), - reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); - } -} - -template -struct FusedLstmCellGradFunctor final { - void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const T* grad_hy_ptr, const T* grad_cy_ptr, - const T* cx_ptr, const T* cy_ptr, const T* workspace_ptr, T* grad_gates_ptr, - T* grad_cx_ptr) { - using ACC_T = acc_type; - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, - static_cast(cx_numel), static_cast(hidden_size), - grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, grad_gates_ptr, - grad_cx_ptr); - } else { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, - hidden_size, grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, - grad_gates_ptr, grad_cx_ptr); - } - } -}; - -template<> -void FusedLstmCellGradFunctor::operator()( - ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, - const int64_t hidden_size, const float16* grad_hy_ptr, const float16* grad_cy_ptr, - const float16* cx_ptr, const float16* cy_ptr, const float16* workspace_ptr, - float16* grad_gates_ptr, float16* grad_cx_ptr) { - if (workspace_numel < std::numeric_limits::max()) { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, - static_cast(cx_numel), 
static_cast(hidden_size), - reinterpret_cast(grad_hy_ptr), - reinterpret_cast(grad_cy_ptr), - reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), - reinterpret_cast(workspace_ptr), - reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); - } else { - RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, - hidden_size, reinterpret_cast(grad_hy_ptr), - reinterpret_cast(grad_cy_ptr), - reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), - reinterpret_cast(workspace_ptr), - reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); - } -} - -} // namespace - -template -class GpuFusedLstmCellKernel final : public user_op::OpKernel { - public: - GpuFusedLstmCellKernel() = default; - ~GpuFusedLstmCellKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); - const user_op::Tensor* hidden_gates = ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); - const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); - user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); - user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); - user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - - const T* input_bias_ptr = nullptr; - const T* hidden_bias_ptr = nullptr; - if (ctx->has_input("input_bias", 0)) { - CHECK(ctx->has_input("hidden_bias", 0)); - input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); - hidden_bias_ptr = ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); - } - const T* input_gates_ptr = input_gates->dptr(); - const T* hidden_gates_ptr = hidden_gates->dptr(); - const T* cx_ptr = cx->dptr(); - - T* hy_ptr = hy->mut_dptr(); - T* cy_ptr = cy->mut_dptr(); - T* workspace_ptr = workspace->mut_dptr(); - const int64_t cx_numel = cx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); - FusedLstmCellFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, - input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, - hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_LSTM_CELL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_lstm_cell") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("cx", 0) == GetDataType::value) \ - && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ - && (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) - -REGISTER_FUSED_LSTM_CELL_KERNEL(float); -REGISTER_FUSED_LSTM_CELL_KERNEL(float16); - -class GpuFusedLstmCellGradFloatKernel final : public user_op::OpKernel { - public: - GpuFusedLstmCellGradFloatKernel() = default; - ~GpuFusedLstmCellGradFloatKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); - const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); - const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_gates = 
ctx->Tensor4ArgNameAndIndex("grad_gates", 0); - user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); - - const float* grad_hy_ptr = grad_hy->dptr(); - const float* grad_cy_ptr = grad_cy->dptr(); - const float* cx_ptr = cx->dptr(); - const float* cy_ptr = cy->dptr(); - const float* workspace_ptr = workspace->dptr(); - - float* grad_gates_ptr = grad_gates->mut_dptr(); - float* grad_cx_ptr = nullptr; - - if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } - - const int64_t cx_numel = cx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); - FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, - grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, - grad_gates_ptr, grad_cx_ptr); - - if (ctx->has_output("grad_bias", 0)) { - float* grad_bias_ptr = ctx->Tensor4ArgNameAndIndex("grad_bias", 0)->mut_dptr(); - std::vector axis; - axis.push_back(0); - const Shape& reduced_shape = - CreateReducedShape(workspace->shape_view(), {axis.begin(), axis.end()}); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, grad_bias_ptr), - XpuVarNdarray(grad_gates->shape_view(), grad_gates->dptr()), - XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_lstm_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) - && (user_op::HobDataType("cx", 0) == GetDataType::value) - && (user_op::HobDataType("cy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); - tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - -class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { - public: - GpuFusedLstmCellGradHalfKernel() = default; - ~GpuFusedLstmCellGradHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); - const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); - const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); - const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); - const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); - user_op::Tensor* grad_gates = ctx->Tensor4ArgNameAndIndex("grad_gates", 0); - user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); - - const float16* grad_hy_ptr = grad_hy->dptr(); - const float16* grad_cy_ptr = grad_cy->dptr(); - const float16* cx_ptr = cx->dptr(); - const float16* cy_ptr = cy->dptr(); - const float16* workspace_ptr = workspace->dptr(); - - float16* grad_gates_ptr = grad_gates->mut_dptr(); - float16* grad_cx_ptr = nullptr; - - if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } - - const int64_t cx_numel = 
cx->shape_view().elem_cnt(); - const int64_t workspace_numel = workspace->shape_view().elem_cnt(); - const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); - FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, - grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, - grad_gates_ptr, grad_cx_ptr); - - if (ctx->has_output("grad_bias", 0)) { - std::vector axis; - axis.push_back(0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const ShapeView& in_shape = grad_gates->shape_view(); - const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); - float* in_tmp_buffer = tmp_buffer->mut_dptr(); - const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - float* out_tmp_buffer = - reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); - const size_t out_tmp_buffer_bytes = - GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); - float* reduce_tmp_buffer = reinterpret_cast( - tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); - const size_t reduce_tmp_buffer_bytes = - GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); - CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, - tmp_buffer->shape_view().elem_cnt()); - auto h2f = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat16, DataType::kFloat); - CHECK(h2f); - auto f2h = ep::primitive::NewPrimitive( - ctx->device_type(), DataType::kFloat, DataType::kFloat16); - CHECK(f2h); - h2f->Launch(ctx->stream(), grad_gates->dptr(), in_tmp_buffer, in_shape.elem_cnt()); - - NdarrayReduce::Reduce( - ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), - XpuVarNdarray(in_shape, in_tmp_buffer), - XpuVarNdarray(in_shape, reduce_tmp_buffer)); - - user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_bias", 0); - f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), - output_tensor->shape_view().elem_cnt()); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("fused_lstm_cell_grad") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) - && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) - && (user_op::HobDataType("cx", 0) == GetDataType::value) - && (user_op::HobDataType("cy", 0) == GetDataType::value) - && (user_op::HobDataType("workspace", 0) == GetDataType::value)) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - size_t tmp_bytes = 0; - if (ctx->has_output("grad_bias", 0)) { - const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); - const Shape& out_shape = ctx->OutputTensorDesc("grad_bias", 0)->shape(); - tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)) - + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); - } else { - tmp_bytes = 0; - } - return tmp_bytes; - }); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
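For reference, the lstm_cell_forward and lstm_cell_backward kernels in this file implement the standard LSTM cell update and its gradient per output element. The following is a minimal host-side C++ sketch of that math under the gate layout [i, f, g, o] used by the kernels' offset arithmetic; the function names and the float-only signatures are illustrative and not part of the patch.

#include <cmath>
#include <cstdint>

// Forward for one (row, col) element; input_gates/hidden_gates/workspace point to the
// 4*hidden_size slice of the current row. Workspace stores the post-activation gates
// so the backward pass can reuse them.
inline void lstm_cell_forward_ref(const float* input_gates, const float* hidden_gates,
                                  const float* input_bias, const float* hidden_bias,
                                  float cx, int64_t col, int64_t hidden_size,
                                  float* hy, float* cy, float* workspace) {
  auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
  auto gate = [&](int g) {
    return input_gates[g * hidden_size + col] + hidden_gates[g * hidden_size + col]
           + (input_bias ? input_bias[g * hidden_size + col] : 0.0f)
           + (hidden_bias ? hidden_bias[g * hidden_size + col] : 0.0f);
  };
  const float ig = sigmoid(gate(0));    // input gate
  const float fg = sigmoid(gate(1));    // forget gate
  const float cg = std::tanh(gate(2));  // cell candidate
  const float og = sigmoid(gate(3));    // output gate
  *cy = fg * cx + ig * cg;
  *hy = og * std::tanh(*cy);
  workspace[0 * hidden_size + col] = ig;
  workspace[1 * hidden_size + col] = fg;
  workspace[2 * hidden_size + col] = cg;
  workspace[3 * hidden_size + col] = og;
}

// Backward for the same element: gradients of the pre-activation gates and of cx.
// ig/fg/cg/og are the saved post-activation gates from the forward workspace.
inline void lstm_cell_backward_ref(float grad_hy, float grad_cy, float cx, float cy,
                                   float ig, float fg, float cg, float og,
                                   float* gig, float* gfg, float* gcg, float* gog,
                                   float* grad_cx /* may be nullptr */) {
  const float tanh_cy = std::tanh(cy);
  *gog = grad_hy * tanh_cy * (1.0f - og) * og;
  const float gcx = grad_hy * og * (1.0f - tanh_cy * tanh_cy) + grad_cy;
  *gig = gcx * cg * (1.0f - ig) * ig;
  *gfg = gcx * cx * (1.0f - fg) * fg;
  *gcg = gcx * ig * (1.0f - cg * cg);
  if (grad_cx != nullptr) { *grad_cx = gcx * fg; }
}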
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ndarray/xpu_var_ndarray.h" +#include "oneflow/core/kernel/kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/ep/rocm/cuda_device.h" +#include "oneflow/core/ep/include/primitive/matmul.h" +#include "oneflow/user/kernels/fused_rnn_cell_kernel_util.h" + +// NOTE(Liang Depeng): The implementation of fused_lstm_cell is modified from +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/RNN.cu + +namespace oneflow { + +namespace { + +template +struct AccumulateType {}; +template<> +struct AccumulateType { + using type = float; +}; +template<> +struct AccumulateType { + using type = double; +}; + +template +using acc_type = typename AccumulateType::type; + +#define H2F(input) static_cast(input) +#define F2H(input) static_cast(input) + +template +__device__ __forceinline__ T sigmoid(T in) { + T one = static_cast(1.0); + return one / (one + ::exp(-in)); +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ + void lstm_cell_forward(const IDX_TYPE numel, const IDX_TYPE hidden_size, + const T* input_gates_ptr, const T* hidden_gates_ptr, const T* cx_ptr, + const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, T* cy_ptr, + T* workspace_ptr) { + bool has_bias = input_bias_ptr != nullptr; + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; + + T iig = input_gates_ptr[offset + 0 * hidden_size]; + T ifg = input_gates_ptr[offset + 1 * hidden_size]; + T icg = input_gates_ptr[offset + 2 * hidden_size]; + T iog = input_gates_ptr[offset + 3 * hidden_size]; + + T hig = hidden_gates_ptr[offset + 0 * hidden_size]; + T hfg = hidden_gates_ptr[offset + 1 * hidden_size]; + T hcg = hidden_gates_ptr[offset + 2 * hidden_size]; + T hog = hidden_gates_ptr[offset + 3 * hidden_size]; + + T* wig = &(workspace_ptr[offset + 0 * hidden_size]); + T* wfg = &(workspace_ptr[offset + 1 * hidden_size]); + T* wcg = &(workspace_ptr[offset + 2 * hidden_size]); + T* wog = &(workspace_ptr[offset + 3 * hidden_size]); + + T cx = cx_ptr[linearIndex]; + + T* hy = &(hy_ptr[linearIndex]); + T* cy = &(cy_ptr[linearIndex]); + + T b1i, b1f, b1c, b1o; + T b2i, b2f, b2c, b2o; + + if (has_bias) { + b1i = input_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b1f = input_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b1c = input_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + b1o = input_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; + + b2i = hidden_bias_ptr[linearIndex % hidden_size + 0 * hidden_size]; + b2f = hidden_bias_ptr[linearIndex % hidden_size + 1 * hidden_size]; + b2c = hidden_bias_ptr[linearIndex % hidden_size + 2 * hidden_size]; + b2o = hidden_bias_ptr[linearIndex % hidden_size + 3 * hidden_size]; + } else { + b1i = F2H(0.0); + b1f = F2H(0.0); + b1c = F2H(0.0); + b1o = F2H(0.0); + b2i = F2H(0.0); + b2f = F2H(0.0); + b2c = F2H(0.0); + b2o = F2H(0.0); + } + + ACC_T ig, fg, cg, og; + ACC_T f_hy, f_cy; + + ig = 
sigmoid(H2F(iig) + H2F(hig) + H2F(b1i) + H2F(b2i)); + fg = sigmoid(H2F(ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f)); + cg = ::tanh(H2F(icg) + H2F(hcg) + H2F(b1c) + H2F(b2c)); + og = sigmoid(H2F(iog) + H2F(hog) + H2F(b1o) + H2F(b2o)); + + f_cy = (fg * H2F(cx)) + (ig * cg); + f_hy = og * ::tanh(f_cy); + + *hy = F2H(f_hy); + *cy = F2H(f_cy); + + // SAVE FOR BACKWARDS + // Also need cy and cx but can be saved easily in python + *wig = F2H(ig); + *wfg = F2H(fg); + *wcg = F2H(cg); + *wog = F2H(og); + } +} + +template +#if __CUDA_ARCH__ >= 350 +OF_LAUNCH_BOUNDS_2(512, 4) +#endif +__global__ + void lstm_cell_backward(const IDX_TYPE numel, const IDX_TYPE hidden_size, const T* grad_hy_ptr, + const T* grad_cy_ptr, const T* cx_ptr, const T* cy_ptr, + const T* workspace_ptr, T* grad_gates_ptr, T* grad_cx_ptr) { + for (IDX_TYPE linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < numel; + linearIndex += gridDim.x * blockDim.x) { + IDX_TYPE offset = (linearIndex / hidden_size) * 4 * hidden_size + linearIndex % hidden_size; + + T ig = workspace_ptr[offset + 0 * hidden_size]; + T fg = workspace_ptr[offset + 1 * hidden_size]; + T cg = workspace_ptr[offset + 2 * hidden_size]; + T og = workspace_ptr[offset + 3 * hidden_size]; + + T* ih = &(grad_gates_ptr[offset + 0 * hidden_size]); + T* fh = &(grad_gates_ptr[offset + 1 * hidden_size]); + T* ch = &(grad_gates_ptr[offset + 2 * hidden_size]); + T* oh = &(grad_gates_ptr[offset + 3 * hidden_size]); + + // will return hidden grads here + T cx = cx_ptr[linearIndex]; + T cy = cy_ptr[linearIndex]; + + ACC_T go = H2F(grad_hy_ptr[linearIndex]); + ACC_T goc = H2F(grad_cy_ptr[linearIndex]); + + ACC_T gcx = ::tanh(H2F(cy)); + + ACC_T gog = go * gcx; + gcx = go * H2F(og) * (1 - gcx * gcx) + goc; + + ACC_T gig = gcx * H2F(cg); + ACC_T gfg = gcx * H2F(cx); + ACC_T gcg = gcx * H2F(ig); + + gig = gig * (1 - H2F(ig)) * H2F(ig); + gfg = gfg * (1 - H2F(fg)) * H2F(fg); + gcg = gcg * (1 - H2F(cg) * H2F(cg)); + gog = gog * (1 - H2F(og)) * H2F(og); + + *ih = F2H(gig); + *fh = F2H(gfg); + *ch = F2H(gcg); + *oh = F2H(gog); + + if (grad_cx_ptr != nullptr) { + gcx = gcx * H2F(fg); + T* gi = &(grad_cx_ptr[linearIndex]); + *gi = F2H(gcx); + } + } +} + +template +struct FusedLstmCellFunctor final { + void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* input_gates_ptr, const T* hidden_gates_ptr, + const T* cx_ptr, const T* input_bias_ptr, const T* hidden_bias_ptr, T* hy_ptr, + T* cy_ptr, T* workspace_ptr) { + using ACC_T = acc_type; + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, + static_cast(cx_numel), static_cast(hidden_size), + input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, hidden_bias_ptr, + hy_ptr, cy_ptr, workspace_ptr); + } else { + RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, + hidden_size, input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, + hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); + } + } +}; + +template<> +void FusedLstmCellFunctor::operator()( + ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* input_gates_ptr, const float16* hidden_gates_ptr, + const float16* cx_ptr, const float16* input_bias_ptr, const float16* hidden_bias_ptr, + float16* hy_ptr, float16* cy_ptr, float16* workspace_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL( + (lstm_cell_forward), stream, cx_numel, 
static_cast(cx_numel), + static_cast(hidden_size), reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), reinterpret_cast(cx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); + } else { + RUN_CUDA_KERNEL((lstm_cell_forward), stream, cx_numel, cx_numel, + hidden_size, reinterpret_cast(input_gates_ptr), + reinterpret_cast(hidden_gates_ptr), + reinterpret_cast(cx_ptr), + reinterpret_cast(input_bias_ptr), + reinterpret_cast(hidden_bias_ptr), reinterpret_cast(hy_ptr), + reinterpret_cast(cy_ptr), reinterpret_cast(workspace_ptr)); + } +} + +template +struct FusedLstmCellGradFunctor final { + void operator()(ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const T* grad_hy_ptr, const T* grad_cy_ptr, + const T* cx_ptr, const T* cy_ptr, const T* workspace_ptr, T* grad_gates_ptr, + T* grad_cx_ptr) { + using ACC_T = acc_type; + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, + static_cast(cx_numel), static_cast(hidden_size), + grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, grad_gates_ptr, + grad_cx_ptr); + } else { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, + hidden_size, grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, + grad_gates_ptr, grad_cx_ptr); + } + } +}; + +template<> +void FusedLstmCellGradFunctor::operator()( + ep::Stream* stream, const int64_t cx_numel, const int64_t workspace_numel, + const int64_t hidden_size, const float16* grad_hy_ptr, const float16* grad_cy_ptr, + const float16* cx_ptr, const float16* cy_ptr, const float16* workspace_ptr, + float16* grad_gates_ptr, float16* grad_cx_ptr) { + if (workspace_numel < std::numeric_limits::max()) { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, + static_cast(cx_numel), static_cast(hidden_size), + reinterpret_cast(grad_hy_ptr), + reinterpret_cast(grad_cy_ptr), + reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), + reinterpret_cast(workspace_ptr), + reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); + } else { + RUN_CUDA_KERNEL((lstm_cell_backward), stream, cx_numel, cx_numel, + hidden_size, reinterpret_cast(grad_hy_ptr), + reinterpret_cast(grad_cy_ptr), + reinterpret_cast(cx_ptr), reinterpret_cast(cy_ptr), + reinterpret_cast(workspace_ptr), + reinterpret_cast(grad_gates_ptr), reinterpret_cast(grad_cx_ptr)); + } +} + +} // namespace + +template +class GpuFusedLstmCellKernel final : public user_op::OpKernel { + public: + GpuFusedLstmCellKernel() = default; + ~GpuFusedLstmCellKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* input_gates = ctx->Tensor4ArgNameAndIndex("input_gates", 0); + const user_op::Tensor* hidden_gates = ctx->Tensor4ArgNameAndIndex("hidden_gates", 0); + const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); + user_op::Tensor* hy = ctx->Tensor4ArgNameAndIndex("hy", 0); + user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); + user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + + const T* input_bias_ptr = nullptr; + const T* hidden_bias_ptr = nullptr; + if (ctx->has_input("input_bias", 0)) { + CHECK(ctx->has_input("hidden_bias", 0)); + input_bias_ptr = ctx->Tensor4ArgNameAndIndex("input_bias", 0)->dptr(); + hidden_bias_ptr = 
ctx->Tensor4ArgNameAndIndex("hidden_bias", 0)->dptr(); + } + const T* input_gates_ptr = input_gates->dptr(); + const T* hidden_gates_ptr = hidden_gates->dptr(); + const T* cx_ptr = cx->dptr(); + + T* hy_ptr = hy->mut_dptr(); + T* cy_ptr = cy->mut_dptr(); + T* workspace_ptr = workspace->mut_dptr(); + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); + FusedLstmCellFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, + input_gates_ptr, hidden_gates_ptr, cx_ptr, input_bias_ptr, + hidden_bias_ptr, hy_ptr, cy_ptr, workspace_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_LSTM_CELL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_lstm_cell") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("cx", 0) == GetDataType::value) \ + && (user_op::HobDataType("input_gates", 0) == GetDataType::value) \ + && (user_op::HobDataType("hidden_gates", 0) == GetDataType::value)) + +REGISTER_FUSED_LSTM_CELL_KERNEL(float); +REGISTER_FUSED_LSTM_CELL_KERNEL(float16); + +class GpuFusedLstmCellGradFloatKernel final : public user_op::OpKernel { + public: + GpuFusedLstmCellGradFloatKernel() = default; + ~GpuFusedLstmCellGradFloatKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); + const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); + const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_gates = ctx->Tensor4ArgNameAndIndex("grad_gates", 0); + user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); + + const float* grad_hy_ptr = grad_hy->dptr(); + const float* grad_cy_ptr = grad_cy->dptr(); + const float* cx_ptr = cx->dptr(); + const float* cy_ptr = cy->dptr(); + const float* workspace_ptr = workspace->dptr(); + + float* grad_gates_ptr = grad_gates->mut_dptr(); + float* grad_cx_ptr = nullptr; + + if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } + + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); + FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, + grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, + grad_gates_ptr, grad_cx_ptr); + + if (ctx->has_output("grad_bias", 0)) { + float* grad_bias_ptr = ctx->Tensor4ArgNameAndIndex("grad_bias", 0)->mut_dptr(); + std::vector axis; + axis.push_back(0); + const Shape& reduced_shape = + CreateReducedShape(workspace->shape_view(), {axis.begin(), axis.end()}); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, grad_bias_ptr), + XpuVarNdarray(grad_gates->shape_view(), grad_gates->dptr()), + XpuVarNdarray(tmp_buffer->shape_view(), tmp_buffer->mut_dptr())); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("fused_lstm_cell_grad") + 
.SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) + && (user_op::HobDataType("cx", 0) == GetDataType::value) + && (user_op::HobDataType("cy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); + tmp_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + +class GpuFusedLstmCellGradHalfKernel final : public user_op::OpKernel { + public: + GpuFusedLstmCellGradHalfKernel() = default; + ~GpuFusedLstmCellGradHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* grad_hy = ctx->Tensor4ArgNameAndIndex("grad_hy", 0); + const user_op::Tensor* grad_cy = ctx->Tensor4ArgNameAndIndex("grad_cy", 0); + const user_op::Tensor* cx = ctx->Tensor4ArgNameAndIndex("cx", 0); + const user_op::Tensor* cy = ctx->Tensor4ArgNameAndIndex("cy", 0); + const user_op::Tensor* workspace = ctx->Tensor4ArgNameAndIndex("workspace", 0); + user_op::Tensor* grad_gates = ctx->Tensor4ArgNameAndIndex("grad_gates", 0); + user_op::Tensor* grad_cx = ctx->Tensor4ArgNameAndIndex("grad_cx", 0); + + const float16* grad_hy_ptr = grad_hy->dptr(); + const float16* grad_cy_ptr = grad_cy->dptr(); + const float16* cx_ptr = cx->dptr(); + const float16* cy_ptr = cy->dptr(); + const float16* workspace_ptr = workspace->dptr(); + + float16* grad_gates_ptr = grad_gates->mut_dptr(); + float16* grad_cx_ptr = nullptr; + + if (ctx->has_output("grad_cx", 0)) { grad_cx_ptr = grad_cx->mut_dptr(); } + + const int64_t cx_numel = cx->shape_view().elem_cnt(); + const int64_t workspace_numel = workspace->shape_view().elem_cnt(); + const int64_t hidden_size = cx->shape_view().At(cx->shape_view().NumAxes() - 1); + FusedLstmCellGradFunctor()(ctx->stream(), cx_numel, workspace_numel, hidden_size, + grad_hy_ptr, grad_cy_ptr, cx_ptr, cy_ptr, workspace_ptr, + grad_gates_ptr, grad_cx_ptr); + + if (ctx->has_output("grad_bias", 0)) { + std::vector axis; + axis.push_back(0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const ShapeView& in_shape = grad_gates->shape_view(); + const Shape& reduced_shape = CreateReducedShape(in_shape, {axis.begin(), axis.end()}); + float* in_tmp_buffer = tmp_buffer->mut_dptr(); + const size_t in_tmp_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + float* out_tmp_buffer = + reinterpret_cast(tmp_buffer->mut_dptr() + in_tmp_buffer_bytes); + const size_t out_tmp_buffer_bytes = + GetCudaAlignedSize(reduced_shape.elem_cnt() * sizeof(float)); + float* reduce_tmp_buffer = reinterpret_cast( + tmp_buffer->mut_dptr() + in_tmp_buffer_bytes + out_tmp_buffer_bytes); + const size_t reduce_tmp_buffer_bytes = + GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)); + CHECK_LE(in_tmp_buffer_bytes + out_tmp_buffer_bytes + reduce_tmp_buffer_bytes, + tmp_buffer->shape_view().elem_cnt()); + auto h2f = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat16, DataType::kFloat); + CHECK(h2f); + auto f2h = ep::primitive::NewPrimitive( + ctx->device_type(), DataType::kFloat, DataType::kFloat16); + CHECK(f2h); + 
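The grad_bias path at this point casts the half-precision grad_gates to a float scratch buffer, reduces over the leading (batch) axis, and casts the reduced row back to half, as the following statements show. Below is a minimal reference of the equivalent reduction with float accumulation; the helper name and the use of float storage for the half input are illustrative simplifications.

#include <cstdint>
#include <vector>

// grad_gates: [rows, gate_cols], modelled as float here for clarity (the kernel casts
// half -> float first); grad_bias: [gate_cols]. Accumulation happens in float, mirroring
// the h2f -> reduce -> f2h flow in the kernel.
std::vector<float> ReduceGradBiasRef(const std::vector<float>& grad_gates,
                                     int64_t rows, int64_t gate_cols) {
  std::vector<float> grad_bias(gate_cols, 0.0f);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < gate_cols; ++c) {
      grad_bias[c] += grad_gates[r * gate_cols + c];  // sum over axis 0
    }
  }
  return grad_bias;  // the kernel then casts this back to half with the f2h primitive
}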
h2f->Launch(ctx->stream(), grad_gates->dptr(), in_tmp_buffer, in_shape.elem_cnt()); + + NdarrayReduce::Reduce( + ctx->stream(), XpuVarNdarray(reduced_shape, out_tmp_buffer), + XpuVarNdarray(in_shape, in_tmp_buffer), + XpuVarNdarray(in_shape, reduce_tmp_buffer)); + + user_op::Tensor* output_tensor = ctx->Tensor4ArgNameAndIndex("grad_bias", 0); + f2h->Launch(ctx->stream(), out_tmp_buffer, output_tensor->mut_dptr(), + output_tensor->shape_view().elem_cnt()); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("fused_lstm_cell_grad") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("grad_hy", 0) == GetDataType::value) + && (user_op::HobDataType("grad_cy", 0) == GetDataType::value) + && (user_op::HobDataType("cx", 0) == GetDataType::value) + && (user_op::HobDataType("cy", 0) == GetDataType::value) + && (user_op::HobDataType("workspace", 0) == GetDataType::value)) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + size_t tmp_bytes = 0; + if (ctx->has_output("grad_bias", 0)) { + const Shape& in_shape = ctx->InputTensorDesc("workspace", 0).shape(); + const Shape& out_shape = ctx->OutputTensorDesc("grad_bias", 0)->shape(); + tmp_bytes = (2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(float)) + + GetCudaAlignedSize(out_shape.elem_cnt() * sizeof(float))); + } else { + tmp_bytes = 0; + } + return tmp_bytes; + }); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp index e1f6dc6..b7f1bd0 100644 --- a/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_relu_dropout_grad_kernel.hip.cpp @@ -1,146 +1,146 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
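The InferTmpSizeFn registered above reserves two float-sized copies of the workspace (the cast input plus the reduce scratch) and one float-sized copy of grad_bias. A small sketch of that arithmetic follows; GetCudaAlignedSize is modelled here as rounding up to an assumed 512-byte alignment, and the function names are illustrative.

#include <cstddef>
#include <cstdint>

// Illustrative stand-in for GetCudaAlignedSize: round a byte count up to the device
// alignment (assumed 512 bytes here).
constexpr size_t AlignedSize(size_t bytes, size_t align = 512) {
  return (bytes + align - 1) / align * align;
}

// tmp_buffer layout used by the half grad kernel when grad_bias is requested:
//   [ grad_gates cast to float | reduced output in float | reduce scratch ]
size_t InferLstmGradTmpBytes(int64_t workspace_elem_cnt, int64_t grad_bias_elem_cnt) {
  const size_t in_bytes = AlignedSize(workspace_elem_cnt * sizeof(float));   // h2f output
  const size_t out_bytes = AlignedSize(grad_bias_elem_cnt * sizeof(float));  // reduce result
  const size_t reduce_bytes = in_bytes;                                      // reduce scratch
  return in_bytes + out_bytes + reduce_bytes;  // equals 2 * aligned(in) + aligned(out)
}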
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/hip/elementwise.hip.h" - -namespace oneflow { - -namespace { - -constexpr int32_t kWarpSize = 64; - -template -__global__ void VectorizedReluDropoutBitmaskBackwardKernel( - const IndexType elem_cnt, const IndexType cols, const IndexType aux_ld, const float scale, - const IndexType n_tail, const IndexType tail_offset, const T* dy, const int32_t* mask, T* dx) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadStoreType = cuda::elementwise::PackType; - using LoadStorePack = cuda::elementwise::Pack; - - T t_scale = static_cast(scale); - for (IndexType linear_pack_index = global_thread_id * pack_size; linear_pack_index < elem_cnt; - linear_pack_index += gridDim.x * blockDim.x * pack_size) { - const LoadStoreType* dy_load = reinterpret_cast(dy + linear_pack_index); - LoadStorePack dy_vec; - dy_vec.storage = *dy_load; - - LoadStorePack dx_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - const IndexType linear_index = (linear_pack_index + i); - const IndexType row = linear_index / cols; - const IndexType col = linear_index - row * cols; - const int32_t col_mod_warpsize = col % kWarpSize; - const IndexType aux_idx = ((row * aux_ld) + col) / kWarpSize; - bool is_positive = mask[aux_idx] & (1 << col_mod_warpsize); - dx_vec.elem[i] = - dy_vec.elem[i] * static_cast(static_cast(is_positive)) * static_cast(scale); - } - *(reinterpret_cast(dx + linear_pack_index)) = dx_vec.storage; - } - - if (tail && global_thread_id < n_tail) { - const IndexType tail_index = tail_offset + global_thread_id; - const IndexType tail_row = tail_index / cols; - const IndexType tail_col = tail_index - tail_row * cols; - const IndexType tail_col_mod_warpsize = tail_col % kWarpSize; - const IndexType tail_aux_idx = ((tail_row * aux_ld) + tail_col) / kWarpSize; - bool is_positive = mask[tail_aux_idx] & (1 << tail_col_mod_warpsize); - dx[tail_index] = - dy[tail_index] * static_cast(static_cast(is_positive)) * static_cast(scale); - } -} - -template -void LaunchVectorizedReluDropoutBackwardKernel(ep::Stream* stream, const int64_t elem_cnt, - const int64_t cols, const int64_t aux_ld, - float scale, const T* dy, const int32_t* mask, - T* dx) { - constexpr int pack_size = cuda::elementwise::PackSize(); - const int64_t pack_num = elem_cnt / pack_size; - const int64_t tail_offset = pack_num * pack_size; - const int64_t n_tail = elem_cnt - tail_offset; - const bool tail = n_tail > 0 ? 
true : false; - if (tail) { - if (elem_cnt < GetMaxVal()) { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, - mask, dx); - } else { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, - mask, dx); - } - } else { - if (elem_cnt < GetMaxVal()) { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, - dy, mask, dx); - } else { - stream->As()->LaunchKernelDefaultWaves( - (VectorizedReluDropoutBitmaskBackwardKernel), - std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, - dy, mask, dx); - } - } -} - -template -class FusedReluDropoutGradKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - FusedReluDropoutGradKernel() = default; - ~FusedReluDropoutGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float scale = ctx->Attr("scale"); - - const int64_t cols = dy->shape_view().At(1); - const int64_t aux_ld = mask->shape_view().At(1) * 32; - const int64_t elem_cnt = dy->shape_view().elem_cnt(); - LaunchVectorizedReluDropoutBackwardKernel( - ctx->stream(), elem_cnt, cols, aux_ld, scale, reinterpret_cast(dy->dptr()), - mask->dptr(), reinterpret_cast(dx->mut_dptr())); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ - REGISTER_USER_KERNEL("fused_relu_dropout_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == data_type)); - -REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat) -REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16) - - -} // namespace - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
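The backward kernel in this file reconstructs a keep/drop decision per column from a packed bitmask and scales dy accordingly. A scalar reference of that semantics follows; it mirrors the kernel's word and bit indexing but, to stay well defined, models the mask as 64-bit words matching kWarpSize (the kernel packs the bits into int32 storage), so treat it as a sketch rather than the exact storage format.

#include <cstdint>
#include <vector>

// Relu-dropout backward reference: dx = dy * scale where the keep bit is set, 0 elsewhere.
// aux_ld is the padded row stride of the bitmask, with aux_ld >= cols.
void ReluDropoutGradRef(const std::vector<float>& dy, const std::vector<uint64_t>& mask_bits,
                        int64_t rows, int64_t cols, int64_t aux_ld, float scale,
                        std::vector<float>* dx) {
  constexpr int64_t kWarpSize = 64;
  dx->assign(dy.size(), 0.0f);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      const int64_t word = (r * aux_ld + c) / kWarpSize;  // same word index as the kernel
      const bool kept = (mask_bits[word] >> (c % kWarpSize)) & 1u;
      (*dx)[r * cols + c] = kept ? dy[r * cols + c] * scale : 0.0f;
    }
  }
}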
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/hip/elementwise.hip.h" + +namespace oneflow { + +namespace { + +constexpr int32_t kWarpSize = 64; + +template +__global__ void VectorizedReluDropoutBitmaskBackwardKernel( + const IndexType elem_cnt, const IndexType cols, const IndexType aux_ld, const float scale, + const IndexType n_tail, const IndexType tail_offset, const T* dy, const int32_t* mask, T* dx) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + using LoadStoreType = cuda::elementwise::PackType; + using LoadStorePack = cuda::elementwise::Pack; + + T t_scale = static_cast(scale); + for (IndexType linear_pack_index = global_thread_id * pack_size; linear_pack_index < elem_cnt; + linear_pack_index += gridDim.x * blockDim.x * pack_size) { + const LoadStoreType* dy_load = reinterpret_cast(dy + linear_pack_index); + LoadStorePack dy_vec; + dy_vec.storage = *dy_load; + + LoadStorePack dx_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + const IndexType linear_index = (linear_pack_index + i); + const IndexType row = linear_index / cols; + const IndexType col = linear_index - row * cols; + const int32_t col_mod_warpsize = col % kWarpSize; + const IndexType aux_idx = ((row * aux_ld) + col) / kWarpSize; + bool is_positive = mask[aux_idx] & (1 << col_mod_warpsize); + dx_vec.elem[i] = + dy_vec.elem[i] * static_cast(static_cast(is_positive)) * static_cast(scale); + } + *(reinterpret_cast(dx + linear_pack_index)) = dx_vec.storage; + } + + if (tail && global_thread_id < n_tail) { + const IndexType tail_index = tail_offset + global_thread_id; + const IndexType tail_row = tail_index / cols; + const IndexType tail_col = tail_index - tail_row * cols; + const IndexType tail_col_mod_warpsize = tail_col % kWarpSize; + const IndexType tail_aux_idx = ((tail_row * aux_ld) + tail_col) / kWarpSize; + bool is_positive = mask[tail_aux_idx] & (1 << tail_col_mod_warpsize); + dx[tail_index] = + dy[tail_index] * static_cast(static_cast(is_positive)) * static_cast(scale); + } +} + +template +void LaunchVectorizedReluDropoutBackwardKernel(ep::Stream* stream, const int64_t elem_cnt, + const int64_t cols, const int64_t aux_ld, + float scale, const T* dy, const int32_t* mask, + T* dx) { + constexpr int pack_size = cuda::elementwise::PackSize(); + const int64_t pack_num = elem_cnt / pack_size; + const int64_t tail_offset = pack_num * pack_size; + const int64_t n_tail = elem_cnt - tail_offset; + const bool tail = n_tail > 0 ? 
true : false; + if (tail) { + if (elem_cnt < GetMaxVal()) { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, + mask, dx); + } else { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, n_tail, tail_offset, dy, + mask, dx); + } + } else { + if (elem_cnt < GetMaxVal()) { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, + dy, mask, dx); + } else { + stream->As()->LaunchKernelDefaultWaves( + (VectorizedReluDropoutBitmaskBackwardKernel), + std::max(1, pack_num), elem_cnt, cols, aux_ld, scale, /*n_tail=*/0, tail_offset, + dy, mask, dx); + } + } +} + +template +class FusedReluDropoutGradKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + FusedReluDropoutGradKernel() = default; + ~FusedReluDropoutGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float scale = ctx->Attr("scale"); + + const int64_t cols = dy->shape_view().At(1); + const int64_t aux_ld = mask->shape_view().At(1) * 32; + const int64_t elem_cnt = dy->shape_view().elem_cnt(); + LaunchVectorizedReluDropoutBackwardKernel( + ctx->stream(), elem_cnt, cols, aux_ld, scale, reinterpret_cast(dy->dptr()), + mask->dptr(), reinterpret_cast(dx->mut_dptr())); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(cpp_type, data_type) \ + REGISTER_USER_KERNEL("fused_relu_dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == data_type)); + +REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(float, DataType::kFloat) +REGISTER_FUSED_RELU_DROPOUT_GRAD_KERNEL_GPU(half, DataType::kFloat16) + + +} // namespace + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp b/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp index 8813fb7..2a5e18c 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp +++ b/oneflow/user/kernels/fused_scale_mask_softmax.hip.cpp @@ -1,236 +1,236 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
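The launcher above splits elem_cnt into a vectorized body handled pack_size elements at a time and a scalar tail handled by the same kernel when n_tail > 0. A small self-contained sketch of that split; the pack size of 4 and the element count are example values only.

#include <cstdint>
#include <cstdio>

// How an element count is split for a vectorized kernel: full packs first, scalar tail last.
struct PackSplit {
  int64_t pack_num;     // number of full packs
  int64_t tail_offset;  // first element handled by the tail path
  int64_t n_tail;       // number of tail elements
};

constexpr PackSplit SplitForPack(int64_t elem_cnt, int64_t pack_size) {
  return {elem_cnt / pack_size, (elem_cnt / pack_size) * pack_size,
          elem_cnt - (elem_cnt / pack_size) * pack_size};
}

int main() {
  const PackSplit s = SplitForPack(/*elem_cnt=*/1030, /*pack_size=*/4);
  // 257 packs cover elements [0, 1028); the remaining 2 elements go through the tail path.
  std::printf("packs=%lld tail_offset=%lld n_tail=%lld\n",
              (long long)s.pack_num, (long long)s.tail_offset, (long long)s.n_tail);
  return 0;
}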
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" -namespace oneflow { - -namespace { - -template -void LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, - const int64_t elem_cnt, const int64_t rows, const int64_t cols, - const float fill, const float scale, const int64_t* input_dims, - const int64_t* mask_dims) { - NdIndexOffsetHelper input_index_helper(input_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::BroadcastScaleMaskLoad load( - x, mask, params); - cuda::softmax::DirectStore store(y, cols); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, - const int64_t rows, const int64_t cols, const float fill, - const float scale) { - oneflow::fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); - cuda::softmax::DirectStore store(y, cols); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, - const MASK* mask, const int64_t elem_cnt, const int64_t rows, - const int64_t cols, const float fill, const float scale, - const int64_t* input_dims, const int64_t* mask_dims) { - NdIndexOffsetHelper input_index_helper(input_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_y(y, cols); - cuda::softmax::DirectLoad load_dy(dy, cols); - fused_scale_mask_softmax::BroadcastScaleMaskStore store( - dx, mask, params); - OF_CUDA_CHECK(( - cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); -} - -template -void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, - const MASK* mask, const int64_t rows, const int64_t cols, - const float fill, const float scale) { - fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_y(y, cols); - cuda::softmax::DirectLoad load_dy(dy, cols); - fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); - OF_CUDA_CHECK(( - cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); -} - -constexpr int32_t kMaxNumDims = 5; - -template -class FusedScaleMaskSoftmaxKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxKernel() = default; - ~FusedScaleMaskSoftmaxKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = 
ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const float mask_fill_value = ctx->Attr("mask_fill_value"); - const float scale_value = ctx->Attr("scale_value"); - const ShapeView& x_shape = x->shape_view(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(x_shape.NumAxes(), 2); - const int64_t elem_cnt = x_shape.elem_cnt(); - const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); - const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); - const size_t num_input_dims = x_shape.NumAxes(); - const int64_t* input_dims = x_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseForwardKernel( - ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), - mask->dptr(), rows, cols, mask_fill_value, scale_value); - } -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastForwardKernel( \ - ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ - mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ - simplified_input_dims, simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class FusedScaleMaskSoftmaxGradKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxGradKernel() = default; - ~FusedScaleMaskSoftmaxGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float scale_value = ctx->Attr("scale_value"); - const float mask_fill_value = static_cast(0.0); - const ShapeView& dy_shape = dy->shape_view(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(dy_shape.NumAxes(), 2); - const int64_t elem_cnt = dy_shape.elem_cnt(); - const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); - const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); - const int64_t* input_dims = dy_shape.ptr(); - const size_t num_input_dims = dy_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseBackwardKernel( - ctx->stream()->As()->cuda_stream(), y->dptr(), 
dy->dptr(), - dx->mut_dptr(), mask->dptr(), rows, cols, mask_fill_value, scale_value); - } -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastBackwardKernel( \ - ctx->stream()->As()->cuda_stream(), y->dptr(), dy->dptr(), \ - dx->mut_dptr(), mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ - simplified_input_dims, simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(dtype, mask_dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(dtype, mask_dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
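The forward kernel re-added below computes, per row, y = softmax(mask ? x * scale : fill), with the softmax evaluated in a numerically stable way. A compact single-row reference in plain C++ follows; the function name and the use of float throughout are illustrative.

#include <algorithm>
#include <cmath>
#include <vector>

// Reference for one row of fused_scale_mask_softmax:
//   y = softmax(mask ? x * scale : fill)
std::vector<float> ScaleMaskSoftmaxRowRef(const std::vector<float>& x,
                                          const std::vector<bool>& mask,
                                          float scale, float fill) {
  std::vector<float> y(x.size());
  float row_max = -INFINITY;
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = mask[i] ? x[i] * scale : fill;
    row_max = std::max(row_max, y[i]);
  }
  float sum = 0.0f;
  for (float& v : y) {
    v = std::exp(v - row_max);  // subtract the row max to avoid overflow
    sum += v;
  }
  for (float& v : y) { v /= sum; }
  return y;
}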
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" +namespace oneflow { + +namespace { + +template +void LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const int64_t* input_dims, + const int64_t* mask_dims) { + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::BroadcastScaleMaskLoad load( + x, mask, params); + cuda::softmax::DirectStore store(y, cols); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, const MASK* mask, + const int64_t rows, const int64_t cols, const float fill, + const float scale) { + oneflow::fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); + cuda::softmax::DirectStore store(y, cols); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, + const MASK* mask, const int64_t elem_cnt, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const int64_t* input_dims, const int64_t* mask_dims) { + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + fused_scale_mask_softmax::BroadcastScaleMaskStore store( + dx, mask, params); + OF_CUDA_CHECK(( + cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); +} + +template +void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* y, const T* dy, T* dx, + const MASK* mask, const int64_t rows, const int64_t cols, + const float fill, const float scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_y(y, cols); + cuda::softmax::DirectLoad load_dy(dy, cols); + fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); + OF_CUDA_CHECK(( + cuda::softmax::DispatchSoftmaxGrad(stream, load_y, load_dy, store, rows, cols))); +} + +constexpr int32_t kMaxNumDims = 5; + +template +class FusedScaleMaskSoftmaxKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxKernel() = default; + ~FusedScaleMaskSoftmaxKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = 
ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const float mask_fill_value = ctx->Attr("mask_fill_value"); + const float scale_value = ctx->Attr("scale_value"); + const ShapeView& x_shape = x->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(x_shape.NumAxes(), 2); + const int64_t elem_cnt = x_shape.elem_cnt(); + const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); + const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const size_t num_input_dims = x_shape.NumAxes(); + const int64_t* input_dims = x_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseForwardKernel( + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), + mask->dptr(), rows, cols, mask_fill_value, scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastForwardKernel( \ + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ + mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ + simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class FusedScaleMaskSoftmaxGradKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxGradKernel() = default; + ~FusedScaleMaskSoftmaxGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float scale_value = ctx->Attr("scale_value"); + const float mask_fill_value = static_cast(0.0); + const ShapeView& dy_shape = dy->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(dy_shape.NumAxes(), 2); + const int64_t elem_cnt = dy_shape.elem_cnt(); + const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); + const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t* input_dims = dy_shape.ptr(); + const size_t num_input_dims = dy_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseBackwardKernel( + ctx->stream()->As()->cuda_stream(), y->dptr(), 
dy->dptr(), + dx->mut_dptr(), mask->dptr(), rows, cols, mask_fill_value, scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastBackwardKernel( \ + ctx->stream()->As()->cuda_stream(), y->dptr(), dy->dptr(), \ + dx->mut_dptr(), mask->dptr(), elem_cnt, rows, cols, mask_fill_value, scale_value, \ + simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_CUDA_KERNEL + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dy", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_GRAD_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_scale_mask_softmax.hip.h b/oneflow/user/kernels/fused_scale_mask_softmax.hip.h index 84adfb6..43e49ce 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax.hip.h +++ b/oneflow/user/kernels/fused_scale_mask_softmax.hip.h @@ -1,216 +1,216 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/nd_index_offset_helper.h" - -namespace oneflow { - -namespace fused_scale_mask_softmax { - -namespace { - -void SimplifyBroadcastDims(size_t num_a_dims, const int64_t* a_dims, size_t num_b_dims, - const int64_t* b_dims, size_t* simplified_num_dims, - int64_t* simplified_a_dims, int64_t* simplified_b_dims) { - const size_t num_max_dims = std::max(num_a_dims, num_b_dims); - auto MakeGetDim = [num_max_dims](size_t num_dims, const int64_t* dims) { - const int64_t num_padding_dims = num_max_dims - num_dims; - return [num_padding_dims, dims](size_t index) { - return index < num_padding_dims ? 
1 : dims[index - num_padding_dims]; - }; - }; - auto GetADim = MakeGetDim(num_a_dims, a_dims); - auto GetBDim = MakeGetDim(num_b_dims, b_dims); - *simplified_num_dims = 0; - bool prev_broadcast_a = false; - bool prev_broadcast_b = false; - for (int64_t i = 0; i < num_max_dims; ++i) { - const int64_t a_dim = GetADim(i); - const int64_t b_dim = GetBDim(i); - const int64_t broadcast_dim = std::max(a_dim, b_dim); - CHECK_GT(broadcast_dim, 0); - const bool broadcast_a = (a_dim == 1); - const bool broadcast_b = (b_dim == 1); - CHECK((a_dim == broadcast_dim) || broadcast_a); - CHECK((b_dim == broadcast_dim) || broadcast_b); - if (broadcast_dim == 1) { - continue; - } else if (*simplified_num_dims != 0 - && (prev_broadcast_a == broadcast_a && prev_broadcast_b == broadcast_b)) { - simplified_a_dims[*simplified_num_dims - 1] *= a_dim; - simplified_b_dims[*simplified_num_dims - 1] *= b_dim; - } else { - simplified_a_dims[*simplified_num_dims] = a_dim; - simplified_b_dims[*simplified_num_dims] = b_dim; - *simplified_num_dims += 1; - prev_broadcast_a = broadcast_a; - prev_broadcast_b = broadcast_b; - } - } -} - -template -struct BroadcastMaskSoftmaxParams { - NdIndexOffsetHelper src_index_helper; - NdIndexOffsetHelper mask_index_helper; - const int64_t* mask_dims{}; - int64_t row_size; - float fill; - float scale; -}; - -struct ElementwiseMaskSoftmaxParams { - int64_t row_size; - float fill; - float scale; -}; - -template -struct BroadcastScaleMaskLoad { - BroadcastScaleMaskLoad(const SRC* src, const MASK* mask, - BroadcastMaskSoftmaxParams params) - : src(src), mask(mask), params(params) { - for (int i = 0; i < num_dims; i++) { mask_dims[i] = params.mask_dims[i]; } - } - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - cuda::softmax::Pack mask_pack; - const IndexType offset = row * params.row_size + col; - IndexType input_index[num_dims]; - IndexType mask_index[num_dims]; - params.src_index_helper.OffsetToNdIndex(offset, input_index); - for (int dim = 0; dim < num_dims; ++dim) { - if (mask_dims[dim] == 1) { - mask_index[dim] = 0; - } else { - mask_index[dim] = input_index[dim]; - } - } - const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); - pack.storage = *(reinterpret_cast*>(src) + offset / N); - mask_pack.storage = - *(reinterpret_cast*>(mask) + mask_offset / N); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - dst[i] = static_cast(params.fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(params.scale); - } - } - } - const SRC* src; - const MASK* mask; - int64_t mask_dims[num_dims]; - BroadcastMaskSoftmaxParams params; -}; - -template -struct ElementwiseScaleMaskLoad { - ElementwiseScaleMaskLoad(const SRC* src, const MASK* mask, ElementwiseMaskSoftmaxParams param) - : src(src), mask(mask), param(param) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * param.row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - dst[i] = static_cast(param.fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(param.scale); - } - } - } - const SRC* src; - const MASK* mask; - ElementwiseMaskSoftmaxParams param; -}; - -template -struct BroadcastScaleMaskStore { - BroadcastScaleMaskStore(DST* dst, 
const MASK* mask, - BroadcastMaskSoftmaxParams params) - : dst(dst), mask(mask), params(params) { - for (int i = 0; i < num_dims; ++i) { mask_dims[i] = params.mask_dims[i]; } - } - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - cuda::softmax::Pack mask_pack; - const IndexType offset = row * params.row_size + col; - IndexType input_index[num_dims]; - IndexType mask_index[num_dims]; - params.src_index_helper.OffsetToNdIndex(offset, input_index); - for (int dim = 0; dim < num_dims; ++dim) { - if (mask_dims[dim] == 1) { - mask_index[dim] = 0; - } else { - mask_index[dim] = input_index[dim]; - } - } - const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); - mask_pack.storage = - *(reinterpret_cast*>(mask) + mask_offset / N); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - pack.elem[i] = static_cast(params.fill); - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); - } - } - *(reinterpret_cast*>(dst) + offset / N) = pack.storage; - } - DST* dst; - const MASK* mask; - int64_t mask_dims[num_dims]; - BroadcastMaskSoftmaxParams params; -}; - -template -struct ElementwiseScaleMaskStore { - ElementwiseScaleMaskStore(DST* dst, const MASK* mask, ElementwiseMaskSoftmaxParams params) - : dst(dst), mask(mask), params(params) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * params.row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - if (mask_pack.elem[i] == 0) { - pack.elem[i] = params.fill; - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); - } - } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - const MASK* mask; - ElementwiseMaskSoftmaxParams params; -}; - -} // namespace - -} // namespace fused_scale_mask_softmax - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/common/nd_index_offset_helper.h" + +namespace oneflow { + +namespace fused_scale_mask_softmax { + +namespace { + +void SimplifyBroadcastDims(size_t num_a_dims, const int64_t* a_dims, size_t num_b_dims, + const int64_t* b_dims, size_t* simplified_num_dims, + int64_t* simplified_a_dims, int64_t* simplified_b_dims) { + const size_t num_max_dims = std::max(num_a_dims, num_b_dims); + auto MakeGetDim = [num_max_dims](size_t num_dims, const int64_t* dims) { + const int64_t num_padding_dims = num_max_dims - num_dims; + return [num_padding_dims, dims](size_t index) { + return index < num_padding_dims ? 
1 : dims[index - num_padding_dims]; + }; + }; + auto GetADim = MakeGetDim(num_a_dims, a_dims); + auto GetBDim = MakeGetDim(num_b_dims, b_dims); + *simplified_num_dims = 0; + bool prev_broadcast_a = false; + bool prev_broadcast_b = false; + for (int64_t i = 0; i < num_max_dims; ++i) { + const int64_t a_dim = GetADim(i); + const int64_t b_dim = GetBDim(i); + const int64_t broadcast_dim = std::max(a_dim, b_dim); + CHECK_GT(broadcast_dim, 0); + const bool broadcast_a = (a_dim == 1); + const bool broadcast_b = (b_dim == 1); + CHECK((a_dim == broadcast_dim) || broadcast_a); + CHECK((b_dim == broadcast_dim) || broadcast_b); + if (broadcast_dim == 1) { + continue; + } else if (*simplified_num_dims != 0 + && (prev_broadcast_a == broadcast_a && prev_broadcast_b == broadcast_b)) { + simplified_a_dims[*simplified_num_dims - 1] *= a_dim; + simplified_b_dims[*simplified_num_dims - 1] *= b_dim; + } else { + simplified_a_dims[*simplified_num_dims] = a_dim; + simplified_b_dims[*simplified_num_dims] = b_dim; + *simplified_num_dims += 1; + prev_broadcast_a = broadcast_a; + prev_broadcast_b = broadcast_b; + } + } +} + +template +struct BroadcastMaskSoftmaxParams { + NdIndexOffsetHelper src_index_helper; + NdIndexOffsetHelper mask_index_helper; + const int64_t* mask_dims{}; + int64_t row_size; + float fill; + float scale; +}; + +struct ElementwiseMaskSoftmaxParams { + int64_t row_size; + float fill; + float scale; +}; + +template +struct BroadcastScaleMaskLoad { + BroadcastScaleMaskLoad(const SRC* src, const MASK* mask, + BroadcastMaskSoftmaxParams params) + : src(src), mask(mask), params(params) { + for (int i = 0; i < num_dims; i++) { mask_dims[i] = params.mask_dims[i]; } + } + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + cuda::softmax::Pack mask_pack; + const IndexType offset = row * params.row_size + col; + IndexType input_index[num_dims]; + IndexType mask_index[num_dims]; + params.src_index_helper.OffsetToNdIndex(offset, input_index); + for (int dim = 0; dim < num_dims; ++dim) { + if (mask_dims[dim] == 1) { + mask_index[dim] = 0; + } else { + mask_index[dim] = input_index[dim]; + } + } + const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); + pack.storage = *(reinterpret_cast*>(src) + offset / N); + mask_pack.storage = + *(reinterpret_cast*>(mask) + mask_offset / N); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + dst[i] = static_cast(params.fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(params.scale); + } + } + } + const SRC* src; + const MASK* mask; + int64_t mask_dims[num_dims]; + BroadcastMaskSoftmaxParams params; +}; + +template +struct ElementwiseScaleMaskLoad { + ElementwiseScaleMaskLoad(const SRC* src, const MASK* mask, ElementwiseMaskSoftmaxParams param) + : src(src), mask(mask), param(param) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * param.row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + dst[i] = static_cast(param.fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(param.scale); + } + } + } + const SRC* src; + const MASK* mask; + ElementwiseMaskSoftmaxParams param; +}; + +template +struct BroadcastScaleMaskStore { + BroadcastScaleMaskStore(DST* dst, 
const MASK* mask, + BroadcastMaskSoftmaxParams params) + : dst(dst), mask(mask), params(params) { + for (int i = 0; i < num_dims; ++i) { mask_dims[i] = params.mask_dims[i]; } + } + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + cuda::softmax::Pack mask_pack; + const IndexType offset = row * params.row_size + col; + IndexType input_index[num_dims]; + IndexType mask_index[num_dims]; + params.src_index_helper.OffsetToNdIndex(offset, input_index); + for (int dim = 0; dim < num_dims; ++dim) { + if (mask_dims[dim] == 1) { + mask_index[dim] = 0; + } else { + mask_index[dim] = input_index[dim]; + } + } + const IndexType mask_offset = params.mask_index_helper.NdIndexToOffset(mask_index); + mask_pack.storage = + *(reinterpret_cast*>(mask) + mask_offset / N); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + pack.elem[i] = static_cast(params.fill); + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); + } + } + *(reinterpret_cast*>(dst) + offset / N) = pack.storage; + } + DST* dst; + const MASK* mask; + int64_t mask_dims[num_dims]; + BroadcastMaskSoftmaxParams params; +}; + +template +struct ElementwiseScaleMaskStore { + ElementwiseScaleMaskStore(DST* dst, const MASK* mask, ElementwiseMaskSoftmaxParams params) + : dst(dst), mask(mask), params(params) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * params.row_size + col) / N; + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + if (mask_pack.elem[i] == 0) { + pack.elem[i] = params.fill; + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(params.scale); + } + } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + const MASK* mask; + ElementwiseMaskSoftmaxParams params; +}; + +} // namespace + +} // namespace fused_scale_mask_softmax + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp b/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp index a0e6482..12a8fc9 100644 --- a/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp +++ b/oneflow/user/kernels/fused_scale_mask_softmax_dropout.hip.cpp @@ -1,303 +1,303 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
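Before dispatching, both the plain and the dropout variants of this kernel fold adjacent dimensions that share the same broadcast pattern, so a 4-D attention mask typically collapses to two or three effective dimensions (or to one, which selects the elementwise path). The host-side sketch below mirrors that folding logic on hypothetical shapes; the sketch function name and the shapes are illustrative and not part of this patch.

// Host-side illustration of the broadcast-dim folding performed by
// fused_scale_mask_softmax::SimplifyBroadcastDims above. Shapes are hypothetical.
#include <algorithm>
#include <cstdint>
#include <cstdio>

void SimplifyBroadcastDimsSketch(size_t num_a, const int64_t* a, size_t num_b, const int64_t* b,
                                 size_t* num_out, int64_t* a_out, int64_t* b_out) {
  const size_t num_max = std::max(num_a, num_b);
  auto Get = [num_max](size_t n, const int64_t* d, size_t i) {
    const size_t pad = num_max - n;
    return i < pad ? int64_t{1} : d[i - pad];
  };
  *num_out = 0;
  bool prev_bc_a = false, prev_bc_b = false;
  for (size_t i = 0; i < num_max; ++i) {
    const int64_t ad = Get(num_a, a, i), bd = Get(num_b, b, i);
    const bool bc_a = (ad == 1), bc_b = (bd == 1);
    if (std::max(ad, bd) == 1) { continue; }
    if (*num_out != 0 && prev_bc_a == bc_a && prev_bc_b == bc_b) {
      a_out[*num_out - 1] *= ad;  // same broadcast pattern: fold into the previous dim
      b_out[*num_out - 1] *= bd;
    } else {
      a_out[*num_out] = ad;
      b_out[*num_out] = bd;
      *num_out += 1;
      prev_bc_a = bc_a;
      prev_bc_b = bc_b;
    }
  }
}

int main() {
  // x: (batch=8, heads=16, seq_q=512, seq_k=512), mask: (8, 1, 1, 512)
  const int64_t x_dims[4] = {8, 16, 512, 512};
  const int64_t mask_dims[4] = {8, 1, 1, 512};
  size_t n = 0;
  int64_t sx[4], sm[4];
  SimplifyBroadcastDimsSketch(4, x_dims, 4, mask_dims, &n, sx, sm);
  // Expected: n == 3, sx == {8, 16 * 512, 512}, sm == {8, 1, 512}; the kernel then
  // takes the num_dims == 3 broadcast path rather than the elementwise one.
  for (size_t i = 0; i < n; ++i) {
    std::printf("%lld vs %lld\n", (long long)sx[i], (long long)sm[i]);
  }
  return 0;
}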
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" - -namespace oneflow { - -namespace { - -template -struct DropoutLoad { - DropoutLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) - : src(src), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) - * static_cast(scale); - } - } - const SRC* src; - const bool* mask; - int64_t row_size; - SRC scale; -}; - -template -struct DropoutStore { - DropoutStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) - : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack softmax_y_pack; - cuda::softmax::Pack dst_pack; - const int64_t offset = (row * row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - softmax_y_pack.elem[i] = static_cast(src[i]); - dst_pack.elem[i] = - static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); - } - *(reinterpret_cast*>(softmax_y) + offset) = - softmax_y_pack.storage; - *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; - } - DST* dst; - DST* softmax_y; - const bool* mask; - int64_t row_size; - DST scale; -}; - -template -void LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, - const MASK* mask, const bool* dropout_mask, - const int64_t elem_cnt, const int64_t rows, const int64_t cols, - const float fill, const float scale, const float dropout_scale, - const int64_t* input_dims, const int64_t* mask_dims) { - DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); - NdIndexOffsetHelper input_index_helper(input_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::BroadcastScaleMaskLoad load( - x, mask, params); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, - const MASK* mask, const bool* dropout_mask, const int64_t rows, - const int64_t cols, const float fill, const float scale, - const float dropout_scale) { - fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); - DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - stream, load, store, rows, cols))); -} - -template -void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, - 
const MASK* mask, const bool* dropout_mask, - const int64_t elem_cnt, const int64_t rows, const int64_t cols, - const float fill, const float scale, const float dropout_scale, - const int64_t* input_dims, const int64_t* mask_dims) { - DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); - NdIndexOffsetHelper input_index_helper(input_dims, num_dims); - NdIndexOffsetHelper mask_index_helper(mask_dims, num_dims); - fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; - params.src_index_helper = input_index_helper; - params.mask_index_helper = mask_index_helper; - params.mask_dims = mask_dims; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); - fused_scale_mask_softmax::BroadcastScaleMaskStore store( - dx, mask, params); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - stream, load_softmax_y, load_dy, store, rows, cols))); -} - -template -void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, - const MASK* mask, const bool* dropout_mask, const int64_t rows, - const int64_t cols, const float fill, const float scale, - const float dropout_scale) { - fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; - params.row_size = cols; - params.fill = fill; - params.scale = scale; - cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); - DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); - fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - stream, load_softmax_y, load_dy, store, rows, cols))); -} - -constexpr int32_t kMaxNumDims = 5; - -template -class FusedScaleMaskSoftmaxDropoutKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxDropoutKernel() = default; - ~FusedScaleMaskSoftmaxDropoutKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const float mask_fill_value = ctx->Attr("mask_fill_value"); - const float scale_value = ctx->Attr("scale_value"); - const float dropout_scale_value = ctx->Attr("dropout_scale_value"); - user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const ShapeView& x_shape = x->shape_view(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(x_shape.NumAxes(), 2); - const int64_t elem_cnt = x_shape.elem_cnt(); - const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); - const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); - const size_t num_input_dims = x_shape.NumAxes(); - const int64_t* input_dims = x_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseForwardKernel( - ctx->stream()->As()->cuda_stream(), 
x->dptr(), y->mut_dptr(), - softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, - mask_fill_value, scale_value, dropout_scale_value); - } - -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastForwardKernel( \ - ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ - softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, \ - cols, mask_fill_value, scale_value, dropout_scale_value, simplified_input_dims, \ - simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class FusedScaleMaskSoftmaxDropoutGradKernel final : public user_op::OpKernel { - public: - FusedScaleMaskSoftmaxDropoutGradKernel() = default; - ~FusedScaleMaskSoftmaxDropoutGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float mask_fill_value = static_cast(0.0); - const float scale_value = ctx->Attr("scale_value"); - const float dropout_scale_value = ctx->Attr("dropout_scale_value"); - const ShapeView& dy_shape = dy->shape_view(); - const int64_t elem_cnt = dy_shape.elem_cnt(); - const ShapeView& mask_shape = mask->shape_view(); - CHECK_GE(dy_shape.NumAxes(), 2); - const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); - const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); - const int64_t* input_dims = dy_shape.ptr(); - const size_t num_input_dims = dy_shape.NumAxes(); - const int64_t* mask_dims = mask_shape.ptr(); - const size_t num_mask_dims = mask_shape.NumAxes(); - - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); - - size_t simplified_num_dims = 0; - int64_t simplified_input_dims[kMaxNumDims]; - int64_t simplified_mask_dims[kMaxNumDims]; - fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, - mask_dims, &simplified_num_dims, - simplified_input_dims, simplified_mask_dims); - if (simplified_num_dims == 1) { - LaunchElementwiseBackwardKernel( - ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), - dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, - mask_fill_value, scale_value, dropout_scale_value); - } -#define DEFINE_ONE_ELIF(dims) \ - else if (simplified_num_dims == dims) { \ - LaunchBroadcastBackwardKernel( \ - ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), \ - dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, cols, \ - static_cast(0.0), ctx->Attr("scale_value"), \ - ctx->Attr("dropout_scale_value"), simplified_input_dims, simplified_mask_dims); \ - } - DEFINE_ONE_ELIF(2) - DEFINE_ONE_ELIF(3) - DEFINE_ONE_ELIF(4) -#undef DEFINE_ONE_ELIF - else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -} // namespace - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(dtype, mask_dtype) \ - 
REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL - -#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(dtype, mask_dtype) \ - REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value) \ - && (user_op::HobDataType("mask", 0) == GetDataType::value)); - -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(half, bool) -REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(float, bool) -#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/user/kernels/fused_scale_mask_softmax.hip.h" + +namespace oneflow { + +namespace { + +template +struct DropoutLoad { + DropoutLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) + : src(src), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + cuda::softmax::Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) + * static_cast(scale); + } + } + const SRC* src; + const bool* mask; + int64_t row_size; + SRC scale; +}; + +template +struct DropoutStore { + DropoutStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) + : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack softmax_y_pack; + cuda::softmax::Pack dst_pack; + const int64_t offset = (row * row_size + col) / N; + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + softmax_y_pack.elem[i] = static_cast(src[i]); + dst_pack.elem[i] = + static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); + } + *(reinterpret_cast*>(softmax_y) + offset) = + softmax_y_pack.storage; + *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; + } + DST* dst; + DST* softmax_y; + const bool* mask; + int64_t row_size; + DST scale; +}; + +template +void 
LaunchBroadcastForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, + const MASK* mask, const bool* dropout_mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const float dropout_scale, + const int64_t* input_dims, const int64_t* mask_dims) { + DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); + NdIndexOffsetHelper input_index_helper(input_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::BroadcastScaleMaskLoad load( + x, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchElementwiseForwardKernel(hipStream_t stream, const T* x, T* y, T* softmax_y, + const MASK* mask, const bool* dropout_mask, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const float dropout_scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + fused_scale_mask_softmax::ElementwiseScaleMaskLoad load(x, mask, params); + DropoutStore store(y, softmax_y, dropout_mask, cols, dropout_scale); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + stream, load, store, rows, cols))); +} + +template +void LaunchBroadcastBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, + const MASK* mask, const bool* dropout_mask, + const int64_t elem_cnt, const int64_t rows, const int64_t cols, + const float fill, const float scale, const float dropout_scale, + const int64_t* input_dims, const int64_t* mask_dims) { + DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); + NdIndexOffsetHelper input_index_helper(input_dims, num_dims); + NdIndexOffsetHelper mask_index_helper(mask_dims, num_dims); + fused_scale_mask_softmax::BroadcastMaskSoftmaxParams params; + params.src_index_helper = input_index_helper; + params.mask_index_helper = mask_index_helper; + params.mask_dims = mask_dims; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); + fused_scale_mask_softmax::BroadcastScaleMaskStore store( + dx, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + stream, load_softmax_y, load_dy, store, rows, cols))); +} + +template +void LaunchElementwiseBackwardKernel(hipStream_t stream, const T* softmax_y, const T* dy, T* dx, + const MASK* mask, const bool* dropout_mask, const int64_t rows, + const int64_t cols, const float fill, const float scale, + const float dropout_scale) { + fused_scale_mask_softmax::ElementwiseMaskSoftmaxParams params; + params.row_size = cols; + params.fill = fill; + params.scale = scale; + cuda::softmax::DirectLoad load_softmax_y(softmax_y, cols); + DropoutLoad load_dy(dy, dropout_mask, cols, dropout_scale); + fused_scale_mask_softmax::ElementwiseScaleMaskStore store(dx, mask, params); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + stream, load_softmax_y, load_dy, store, rows, cols))); +} + +constexpr int32_t kMaxNumDims = 5; + +template +class FusedScaleMaskSoftmaxDropoutKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxDropoutKernel() = default; + ~FusedScaleMaskSoftmaxDropoutKernel() 
override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const float mask_fill_value = ctx->Attr("mask_fill_value"); + const float scale_value = ctx->Attr("scale_value"); + const float dropout_scale_value = ctx->Attr("dropout_scale_value"); + user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const ShapeView& x_shape = x->shape_view(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(x_shape.NumAxes(), 2); + const int64_t elem_cnt = x_shape.elem_cnt(); + const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); + const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const size_t num_input_dims = x_shape.NumAxes(); + const int64_t* input_dims = x_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseForwardKernel( + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), + softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, + mask_fill_value, scale_value, dropout_scale_value); + } + +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastForwardKernel( \ + ctx->stream()->As()->cuda_stream(), x->dptr(), y->mut_dptr(), \ + softmax_y->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, \ + cols, mask_fill_value, scale_value, dropout_scale_value, simplified_input_dims, \ + simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class FusedScaleMaskSoftmaxDropoutGradKernel final : public user_op::OpKernel { + public: + FusedScaleMaskSoftmaxDropoutGradKernel() = default; + ~FusedScaleMaskSoftmaxDropoutGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + const user_op::Tensor* dropout_mask = ctx->Tensor4ArgNameAndIndex("dropout_mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float mask_fill_value = static_cast(0.0); + const float scale_value = ctx->Attr("scale_value"); + const float dropout_scale_value = ctx->Attr("dropout_scale_value"); + const ShapeView& dy_shape = dy->shape_view(); + const int64_t elem_cnt = dy_shape.elem_cnt(); + const ShapeView& mask_shape = mask->shape_view(); + CHECK_GE(dy_shape.NumAxes(), 2); + const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); + const int64_t rows = 
dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t* input_dims = dy_shape.ptr(); + const size_t num_input_dims = dy_shape.NumAxes(); + const int64_t* mask_dims = mask_shape.ptr(); + const size_t num_mask_dims = mask_shape.NumAxes(); + + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); + + size_t simplified_num_dims = 0; + int64_t simplified_input_dims[kMaxNumDims]; + int64_t simplified_mask_dims[kMaxNumDims]; + fused_scale_mask_softmax::SimplifyBroadcastDims(num_input_dims, input_dims, num_mask_dims, + mask_dims, &simplified_num_dims, + simplified_input_dims, simplified_mask_dims); + if (simplified_num_dims == 1) { + LaunchElementwiseBackwardKernel( + ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), + dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), rows, cols, + mask_fill_value, scale_value, dropout_scale_value); + } +#define DEFINE_ONE_ELIF(dims) \ + else if (simplified_num_dims == dims) { \ + LaunchBroadcastBackwardKernel( \ + ctx->stream()->As()->cuda_stream(), softmax_y->dptr(), dy->dptr(), \ + dx->mut_dptr(), mask->dptr(), dropout_mask->dptr(), elem_cnt, rows, cols, \ + static_cast(0.0), ctx->Attr("scale_value"), \ + ctx->Attr("dropout_scale_value"), simplified_input_dims, simplified_mask_dims); \ + } + DEFINE_ONE_ELIF(2) + DEFINE_ONE_ELIF(3) + DEFINE_ONE_ELIF(4) +#undef DEFINE_ONE_ELIF + else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +} // namespace + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_CUDA_KERNEL + +#define REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(dtype, mask_dtype) \ + REGISTER_USER_KERNEL("fused_scale_mask_softmax_dropout_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value) \ + && (user_op::HobDataType("mask", 0) == GetDataType::value)); + +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(half, bool) +REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL(float, bool) +#undef REGISTER_FUSED_SCALE_MASK_SOFTMAX_DROPOUT_GRAD_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp b/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp index c72cc2a..458b8b9 100644 --- a/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_self_attention_query_mul_key_and_value_kernel.hip.cpp @@ -1,293 +1,293 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
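Spelled out on the host for a single row, the fused forward op above computes softmax over (mask ? x * scale : mask_fill_value) and then applies dropout to the softmax output as y = softmax_y * dropout_mask * dropout_scale, with softmax_y also written out for the backward pass. The reference below is a minimal illustrative sketch: it ignores the packed, vectorized loads and the warp/block dispatch of the device implementation, and the example values are made up.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative single-row reference for fused_scale_mask_softmax_dropout (not from the patch).
void FusedScaleMaskSoftmaxDropoutRowRef(const float* x, const bool* mask, const bool* dropout_mask,
                                        int64_t cols, float mask_fill_value, float scale,
                                        float dropout_scale, float* softmax_y, float* y) {
  float max_v = -INFINITY;
  for (int64_t i = 0; i < cols; ++i) {
    const float v = mask[i] ? x[i] * scale : mask_fill_value;  // same rule as the *ScaleMaskLoad functors
    max_v = std::fmax(max_v, v);
  }
  float sum = 0.f;
  for (int64_t i = 0; i < cols; ++i) {
    const float v = mask[i] ? x[i] * scale : mask_fill_value;
    softmax_y[i] = std::exp(v - max_v);
    sum += softmax_y[i];
  }
  for (int64_t i = 0; i < cols; ++i) {
    softmax_y[i] /= sum;
    y[i] = softmax_y[i] * (dropout_mask[i] ? dropout_scale : 0.f);  // same rule as DropoutStore
  }
}

int main() {
  const float x[4] = {1.f, 2.f, 3.f, 4.f};
  const bool mask[4] = {true, true, false, true};
  const bool dropout_mask[4] = {true, false, true, true};
  float softmax_y[4], y[4];
  FusedScaleMaskSoftmaxDropoutRowRef(x, mask, dropout_mask, 4, -1e4f, 0.125f, 1.f / 0.9f,
                                     softmax_y, y);
  for (int i = 0; i < 4; ++i) { std::printf("softmax_y=%f y=%f\n", softmax_y[i], y[i]); }
  return 0;
}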
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/slice_util.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/include/primitive/permute.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -inline hipblasOperation_t GetCublasOp(char op) { - switch (op) { - case 'n': - case 'N': { - return HIPBLAS_OP_N; - } - case 't': - case 'T': { - return HIPBLAS_OP_T; - } - case 'c': - case 'C': { - return HIPBLAS_OP_C; - } - default: { - UNIMPLEMENTED(); - } - } - return HIPBLAS_OP_N; -} - -template -struct CudaDataTypeTrait; - -template<> -struct CudaDataTypeTrait { - const static hipblasDatatype_t value = HIPBLAS_R_32F; -}; - -template<> -struct CudaDataTypeTrait { - const static hipblasDatatype_t value = HIPBLAS_R_16F; -}; - -template -void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, - int64_t k, T alpha, const T* a, int64_t lda, int64_t stridea, const T* b, - int64_t ldb, int64_t strideb, T beta, T* c, int64_t ldc, int64_t stridec, - int64_t batch_size) { - hipblasOperation_t opa = GetCublasOp(transa); - hipblasOperation_t opb = GetCublasOp(transb); - - - hipblasDatatype_t data_type = CudaDataTypeTrait::value; - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - handle, opa, opb, m, n, k, reinterpret_cast(&alpha), - reinterpret_cast(a), data_type, lda, stridea, reinterpret_cast(b), - data_type, ldb, strideb, reinterpret_cast(&beta), reinterpret_cast(c), - data_type, ldc, stridec, batch_size, data_type, HIPBLAS_GEMM_DEFAULT)); - -} - - -template<> -void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, - int64_t k, half alpha, const half* a, int64_t lda, int64_t stridea, - const half* b, int64_t ldb, int64_t strideb, half beta, half* c, - int64_t ldc, int64_t stridec, int64_t batch_size) { - using comp_t = float; - hipblasOperation_t opa = GetCublasOp(transa); - hipblasOperation_t opb = GetCublasOp(transb); - - - float alpha_f = static_cast(alpha); - float beta_f = static_cast(beta); - hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; - hipblasDatatype_t data_type = CudaDataTypeTrait::value; - hipblasDatatype_t comp_type = CudaDataTypeTrait::value; - OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( - handle, opa, opb, m, n, k, &alpha_f, reinterpret_cast(a), data_type, lda, - stridea, reinterpret_cast(b), data_type, ldb, strideb, &beta_f, - reinterpret_cast(c), data_type, ldc, stridec, batch_size, comp_type, algo)); - -} - -template<> -void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, - int64_t k, float16 alpha, const float16* a, int64_t lda, - int64_t stridea, const float16* b, int64_t ldb, int64_t strideb, - float16 beta, float16* c, int64_t ldc, int64_t stridec, - int64_t batch_size) { - CublasBatchGemm(handle, transa, transb, m, n, k, static_cast(alpha), - reinterpret_cast(a), lda, stridea, - reinterpret_cast(b), ldb, strideb, static_cast(beta), - reinterpret_cast(c), ldc, stridec, batch_size); -} - - -template -void BatchedGemm(ep::Stream* stream, char opa, char opb, int64_t m, int64_t n, 
int64_t k, - float alpha, const T* a, int64_t lda, int64_t stridea, const T* b, int64_t ldb, - int64_t strideb, float beta, T* c, int64_t ldc, int64_t stridec, - int64_t batch_size) { - // swap m and n, a and b to convert from row-major to col-major - CublasBatchGemm(stream->As()->cublas_handle(), opb, opa, n, m, k, - static_cast(alpha), b, ldb, strideb, a, lda, stridea, static_cast(beta), - c, ldc, stridec, batch_size); -} - -SliceParams ConstructSliceParams4Value(int64_t seq_len, int64_t batch_size, int64_t num_heads, - int64_t head_size) { - // slice (s, b, n, 3, h) to (s, b, n, 1, h) - SliceParams params; - params.ndim = 4; - params.dims[0] = seq_len; - params.dims[1] = batch_size; - params.dims[2] = num_heads; - params.dims[3] = 3 * head_size; - params.start[0] = 0; - params.start[1] = 0; - params.start[2] = 0; - params.start[3] = 2 * head_size; - params.step[0] = 1; - params.step[1] = 1; - params.step[2] = 1; - params.step[3] = 1; - params.size[0] = seq_len; - params.size[1] = batch_size; - params.size[2] = num_heads; - params.size[3] = head_size; - return params; -} - -template -void TransposeGpu(ep::Stream* stream, DataType data_type, const ShapeView& in_shape, - const ShapeView& out_shape, const std::vector& perm, const T* in, - T* out) { - CHECK_EQ(in_shape.NumAxes(), out_shape.NumAxes()); - int32_t num_axes = in_shape.NumAxes(); - CHECK_EQ(num_axes, perm.size()); - for (int i = 0; i < perm.size(); ++i) { CHECK_EQ(in_shape.At(perm[i]), out_shape.At(i)); } - auto transpose = ep::primitive::NewPrimitive(stream->device_type(), - in_shape.NumAxes()); - CHECK(transpose); - transpose->Launch(stream, data_type, in_shape.NumAxes(), in_shape.ptr(), in, perm.data(), out); -} - -template -class FusedSelfAttentionQueryMulKeyAndValueGpuKernel final : public user_op::OpKernel { - public: - FusedSelfAttentionQueryMulKeyAndValueGpuKernel() = default; - ~FusedSelfAttentionQueryMulKeyAndValueGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); - int64_t seq_len = h_tensor->shape_view().At(0); - int64_t batch_size = h_tensor->shape_view().At(1); - int64_t hidden_size = h_tensor->shape_view().At(2); - int64_t head_size = ctx->Attr("head_size"); - int64_t num_heads = hidden_size / (3 * head_size); - int64_t ld = batch_size * hidden_size; - int64_t stride = 3 * head_size; - int64_t k_offset = head_size; - - // q * k: (sq, b, n, h) x (sk, b, n, h) => (b, n, sq, h) x (b, n, sk, h) - // => (b, n, sq, h) x (b, n, h, sk) -> (b, n, sq, sk) - float alpha = ctx->Attr("alpha"); - user_op::Tensor* qmk_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key", 0); - const T* q_dptr = h_tensor->dptr(); - const T* k_dptr = h_tensor->dptr() + k_offset; - BatchedGemm(ctx->stream(), 'N', 'T', seq_len, seq_len, head_size, alpha, q_dptr, ld, stride, - k_dptr, ld, stride, 0.0f, qmk_tensor->mut_dptr(), seq_len, seq_len * seq_len, - batch_size * num_heads); - - // slice v - user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - user_op::Tensor* v_tensor = ctx->Tensor4ArgNameAndIndex("value", 0); - SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); - SliceKernelUtil::Forward(ctx->stream(), params, h_tensor->dptr(), - tmp_v_tensor->mut_dptr()); - // v from (s, b, n, h) transpose to (b, n, s, h) - Shape value_shape({seq_len, batch_size, num_heads, head_size}); - 
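// Explanatory note (not from the original sources): tmp_buffer receives V sliced out of the
// packed (s, b, n, 3*h) hidden_states; ConstructSliceParams4Value starts the last axis at
// 2 * head_size, i.e. the third chunk of the fused QKV projection. The permutation {1, 2, 0, 3}
// below then rewrites that slice from (s, b, n, h) to (b, n, s, h), so each (batch, head) pair
// becomes a contiguous (seq_len, head_size) matrix, presumably the layout expected by the
// attention matmuls that consume "value" downstream.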
TransposeGpu(ctx->stream(), h_tensor->data_type(), value_shape, v_tensor->shape_view(), - {1, 2, 0, 3}, tmp_v_tensor->dptr(), v_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel final : public user_op::OpKernel { - public: - FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() = default; - ~FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* v_grad_tensor = ctx->Tensor4ArgNameAndIndex("value_grad", 0); - const user_op::Tensor* qmk_grad_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key_grad", 0); - const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); - user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - user_op::Tensor* h_grad_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states_grad", 0); - - float alpha = ctx->Attr("alpha"); - int64_t seq_len = h_grad_tensor->shape_view().At(0); - int64_t batch_size = h_grad_tensor->shape_view().At(1); - int64_t hidden_size = h_grad_tensor->shape_view().At(2); - int64_t num_heads = v_grad_tensor->shape_view().At(1); - int64_t head_size = v_grad_tensor->shape_view().At(3); - int64_t ld = batch_size * hidden_size; - int64_t stride = 3 * head_size; - CHECK_EQ(hidden_size, num_heads * stride); - - // transpose from (b, n, s, h) to (s, b, n, h) - Shape value_shape({seq_len, batch_size, num_heads, head_size}); - TransposeGpu(ctx->stream(), v_grad_tensor->data_type(), v_grad_tensor->shape_view(), - value_shape, {2, 0, 1, 3}, v_grad_tensor->dptr(), - tmp_v_tensor->mut_dptr()); - // slice v grad - SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); - SliceKernelUtil::Backward(ctx->stream(), params, tmp_v_tensor->dptr(), - h_grad_tensor->mut_dptr()); - - // grad_q = grad_qmk * k - // (b, n, sq, sk) x (b, n, sk, h) -> (b, n, s, h) <= (s, b, n, h) <= (s, b, n, 3, h) - const T* qmk_grad_dptr = qmk_grad_tensor->dptr(); - const T* k_dptr = h_tensor->dptr() + head_size; - T* grad_q_dptr = h_grad_tensor->mut_dptr(); - BatchedGemm(ctx->stream(), 'N', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, - seq_len, seq_len * seq_len, k_dptr, ld, stride, 0.0f, grad_q_dptr, ld, stride, - batch_size * num_heads); - // grad_k = grad_qmk * q - // (b, n, sk, sq) x (b, n, sq, h) -> (b, n, sk, h) <= (s, b, n, h) <= (s, b, n, 3, h) - const T* q_dptr = h_tensor->dptr(); - T* grad_k_dptr = h_grad_tensor->mut_dptr() + head_size; - BatchedGemm(ctx->stream(), 'T', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, - seq_len, seq_len * seq_len, q_dptr, ld, stride, 0.0f, grad_k_dptr, ld, stride, - batch_size * num_heads); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -size_t InferTmpBufferSize(user_op::InferContext* ctx) { - const Shape* value_shape = ctx->OutputShape("value", 0); - DataType value_dtype = *ctx->OutputDType("value", 0); - return value_shape->elem_cnt() * GetSizeOfDataType(value_dtype); -} - -size_t InferGradTmpBufferSize(user_op::InferContext* ctx) { - const Shape& value_shape = ctx->InputShape("value_grad", 0); - const DataType& value_dtype = ctx->InputDType("value_grad", 0); - return value_shape.elem_cnt() * GetSizeOfDataType(value_dtype); -} - -} // namespace - -#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(dtype) \ - 
REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTmpBufferSize); - -#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferGradTmpBufferSize); - -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float) -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float16) -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float) -REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float16) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/slice_util.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/include/primitive/permute.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +inline hipblasOperation_t GetCublasOp(char op) { + switch (op) { + case 'n': + case 'N': { + return HIPBLAS_OP_N; + } + case 't': + case 'T': { + return HIPBLAS_OP_T; + } + case 'c': + case 'C': { + return HIPBLAS_OP_C; + } + default: { + UNIMPLEMENTED(); + } + } + return HIPBLAS_OP_N; +} + +template +struct CudaDataTypeTrait; + +template<> +struct CudaDataTypeTrait { + const static hipblasDatatype_t value = HIPBLAS_R_32F; +}; + +template<> +struct CudaDataTypeTrait { + const static hipblasDatatype_t value = HIPBLAS_R_16F; +}; + +template +void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, + int64_t k, T alpha, const T* a, int64_t lda, int64_t stridea, const T* b, + int64_t ldb, int64_t strideb, T beta, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size) { + hipblasOperation_t opa = GetCublasOp(transa); + hipblasOperation_t opb = GetCublasOp(transb); + + + hipblasDatatype_t data_type = CudaDataTypeTrait::value; + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + handle, opa, opb, m, n, k, reinterpret_cast(&alpha), + reinterpret_cast(a), data_type, lda, stridea, reinterpret_cast(b), + data_type, ldb, strideb, reinterpret_cast(&beta), reinterpret_cast(c), + data_type, ldc, stridec, batch_size, data_type, HIPBLAS_GEMM_DEFAULT)); + +} + + +template<> +void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, + int64_t k, half alpha, const half* a, int64_t lda, int64_t stridea, + const half* b, int64_t ldb, int64_t strideb, half beta, half* c, + int64_t ldc, int64_t stridec, int64_t batch_size) { + using comp_t = float; + hipblasOperation_t opa = GetCublasOp(transa); + hipblasOperation_t opb = 
GetCublasOp(transb); + + + float alpha_f = static_cast(alpha); + float beta_f = static_cast(beta); + hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT; + hipblasDatatype_t data_type = CudaDataTypeTrait::value; + hipblasDatatype_t comp_type = CudaDataTypeTrait::value; + OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx( + handle, opa, opb, m, n, k, &alpha_f, reinterpret_cast(a), data_type, lda, + stridea, reinterpret_cast(b), data_type, ldb, strideb, &beta_f, + reinterpret_cast(c), data_type, ldc, stridec, batch_size, comp_type, algo)); + +} + +template<> +void CublasBatchGemm(hipblasHandle_t handle, char transa, char transb, int64_t m, int64_t n, + int64_t k, float16 alpha, const float16* a, int64_t lda, + int64_t stridea, const float16* b, int64_t ldb, int64_t strideb, + float16 beta, float16* c, int64_t ldc, int64_t stridec, + int64_t batch_size) { + CublasBatchGemm(handle, transa, transb, m, n, k, static_cast(alpha), + reinterpret_cast(a), lda, stridea, + reinterpret_cast(b), ldb, strideb, static_cast(beta), + reinterpret_cast(c), ldc, stridec, batch_size); +} + + +template +void BatchedGemm(ep::Stream* stream, char opa, char opb, int64_t m, int64_t n, int64_t k, + float alpha, const T* a, int64_t lda, int64_t stridea, const T* b, int64_t ldb, + int64_t strideb, float beta, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size) { + // swap m and n, a and b to convert from row-major to col-major + CublasBatchGemm(stream->As()->cublas_handle(), opb, opa, n, m, k, + static_cast(alpha), b, ldb, strideb, a, lda, stridea, static_cast(beta), + c, ldc, stridec, batch_size); +} + +SliceParams ConstructSliceParams4Value(int64_t seq_len, int64_t batch_size, int64_t num_heads, + int64_t head_size) { + // slice (s, b, n, 3, h) to (s, b, n, 1, h) + SliceParams params; + params.ndim = 4; + params.dims[0] = seq_len; + params.dims[1] = batch_size; + params.dims[2] = num_heads; + params.dims[3] = 3 * head_size; + params.start[0] = 0; + params.start[1] = 0; + params.start[2] = 0; + params.start[3] = 2 * head_size; + params.step[0] = 1; + params.step[1] = 1; + params.step[2] = 1; + params.step[3] = 1; + params.size[0] = seq_len; + params.size[1] = batch_size; + params.size[2] = num_heads; + params.size[3] = head_size; + return params; +} + +template +void TransposeGpu(ep::Stream* stream, DataType data_type, const ShapeView& in_shape, + const ShapeView& out_shape, const std::vector& perm, const T* in, + T* out) { + CHECK_EQ(in_shape.NumAxes(), out_shape.NumAxes()); + int32_t num_axes = in_shape.NumAxes(); + CHECK_EQ(num_axes, perm.size()); + for (int i = 0; i < perm.size(); ++i) { CHECK_EQ(in_shape.At(perm[i]), out_shape.At(i)); } + auto transpose = ep::primitive::NewPrimitive(stream->device_type(), + in_shape.NumAxes()); + CHECK(transpose); + transpose->Launch(stream, data_type, in_shape.NumAxes(), in_shape.ptr(), in, perm.data(), out); +} + +template +class FusedSelfAttentionQueryMulKeyAndValueGpuKernel final : public user_op::OpKernel { + public: + FusedSelfAttentionQueryMulKeyAndValueGpuKernel() = default; + ~FusedSelfAttentionQueryMulKeyAndValueGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); + int64_t seq_len = h_tensor->shape_view().At(0); + int64_t batch_size = h_tensor->shape_view().At(1); + int64_t hidden_size = h_tensor->shape_view().At(2); + int64_t head_size = ctx->Attr("head_size"); + int64_t num_heads = hidden_size / 
(3 * head_size); + int64_t ld = batch_size * hidden_size; + int64_t stride = 3 * head_size; + int64_t k_offset = head_size; + + // q * k: (sq, b, n, h) x (sk, b, n, h) => (b, n, sq, h) x (b, n, sk, h) + // => (b, n, sq, h) x (b, n, h, sk) -> (b, n, sq, sk) + float alpha = ctx->Attr("alpha"); + user_op::Tensor* qmk_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key", 0); + const T* q_dptr = h_tensor->dptr(); + const T* k_dptr = h_tensor->dptr() + k_offset; + BatchedGemm(ctx->stream(), 'N', 'T', seq_len, seq_len, head_size, alpha, q_dptr, ld, stride, + k_dptr, ld, stride, 0.0f, qmk_tensor->mut_dptr(), seq_len, seq_len * seq_len, + batch_size * num_heads); + + // slice v + user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + user_op::Tensor* v_tensor = ctx->Tensor4ArgNameAndIndex("value", 0); + SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); + SliceKernelUtil::Forward(ctx->stream(), params, h_tensor->dptr(), + tmp_v_tensor->mut_dptr()); + // v from (s, b, n, h) transpose to (b, n, s, h) + Shape value_shape({seq_len, batch_size, num_heads, head_size}); + TransposeGpu(ctx->stream(), h_tensor->data_type(), value_shape, v_tensor->shape_view(), + {1, 2, 0, 3}, tmp_v_tensor->dptr(), v_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel final : public user_op::OpKernel { + public: + FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() = default; + ~FusedSelfAttentionQueryMulKeyAndValueGradGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* v_grad_tensor = ctx->Tensor4ArgNameAndIndex("value_grad", 0); + const user_op::Tensor* qmk_grad_tensor = ctx->Tensor4ArgNameAndIndex("query_mul_key_grad", 0); + const user_op::Tensor* h_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states", 0); + user_op::Tensor* tmp_v_tensor = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + user_op::Tensor* h_grad_tensor = ctx->Tensor4ArgNameAndIndex("hidden_states_grad", 0); + + float alpha = ctx->Attr("alpha"); + int64_t seq_len = h_grad_tensor->shape_view().At(0); + int64_t batch_size = h_grad_tensor->shape_view().At(1); + int64_t hidden_size = h_grad_tensor->shape_view().At(2); + int64_t num_heads = v_grad_tensor->shape_view().At(1); + int64_t head_size = v_grad_tensor->shape_view().At(3); + int64_t ld = batch_size * hidden_size; + int64_t stride = 3 * head_size; + CHECK_EQ(hidden_size, num_heads * stride); + + // transpose from (b, n, s, h) to (s, b, n, h) + Shape value_shape({seq_len, batch_size, num_heads, head_size}); + TransposeGpu(ctx->stream(), v_grad_tensor->data_type(), v_grad_tensor->shape_view(), + value_shape, {2, 0, 1, 3}, v_grad_tensor->dptr(), + tmp_v_tensor->mut_dptr()); + // slice v grad + SliceParams params = ConstructSliceParams4Value(seq_len, batch_size, num_heads, head_size); + SliceKernelUtil::Backward(ctx->stream(), params, tmp_v_tensor->dptr(), + h_grad_tensor->mut_dptr()); + + // grad_q = grad_qmk * k + // (b, n, sq, sk) x (b, n, sk, h) -> (b, n, s, h) <= (s, b, n, h) <= (s, b, n, 3, h) + const T* qmk_grad_dptr = qmk_grad_tensor->dptr(); + const T* k_dptr = h_tensor->dptr() + head_size; + T* grad_q_dptr = h_grad_tensor->mut_dptr(); + BatchedGemm(ctx->stream(), 'N', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, + seq_len, seq_len * seq_len, k_dptr, ld, stride, 0.0f, grad_q_dptr, ld, 
stride, + batch_size * num_heads); + // grad_k = grad_qmk * q + // (b, n, sk, sq) x (b, n, sq, h) -> (b, n, sk, h) <= (s, b, n, h) <= (s, b, n, 3, h) + const T* q_dptr = h_tensor->dptr(); + T* grad_k_dptr = h_grad_tensor->mut_dptr() + head_size; + BatchedGemm(ctx->stream(), 'T', 'N', seq_len, head_size, seq_len, alpha, qmk_grad_dptr, + seq_len, seq_len * seq_len, q_dptr, ld, stride, 0.0f, grad_k_dptr, ld, stride, + batch_size * num_heads); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +size_t InferTmpBufferSize(user_op::InferContext* ctx) { + const Shape* value_shape = ctx->OutputShape("value", 0); + DataType value_dtype = *ctx->OutputDType("value", 0); + return value_shape->elem_cnt() * GetSizeOfDataType(value_dtype); +} + +size_t InferGradTmpBufferSize(user_op::InferContext* ctx) { + const Shape& value_shape = ctx->InputShape("value_grad", 0); + const DataType& value_dtype = ctx->InputDType("value_grad", 0); + return value_shape.elem_cnt() * GetSizeOfDataType(value_dtype); +} + +} // namespace + +#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTmpBufferSize); + +#define REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_self_attention_query_mul_key_and_value_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("hidden_states", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferGradTmpBufferSize); + +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float) +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_CUDA_KERNEL(float16) +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float) +REGISTER_FUSED_SELF_ATTENTION_QUERY_MUL_KEY_AND_VALUE_GRAD_CUDA_KERNEL(float16) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp b/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp index 0f8f340..9585018 100644 --- a/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp +++ b/oneflow/user/kernels/fused_tril_scale_softmax_mask_scale_kernel.hip.cpp @@ -1,229 +1,229 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
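// Illustrative host-side sketch (hypothetical names, not from the OneFlow API): the
// fused_self_attention kernels above read q, k and v straight out of the packed hidden_states
// buffer, whose layout the comments give as (seq_len, batch, num_heads, 3, head_size). The three
// projections therefore sit at fixed offsets inside each head's 3 * head_size slot, and the
// strided BatchedGemm calls only need a leading dimension and a per-head stride:
#include <cstdint>

struct PackedQkvOffsets {
  int64_t q = 0;       // query starts at the beginning of the slot
  int64_t k = 0;       // key offset, matches k_offset = head_size in the forward kernel
  int64_t v = 0;       // value offset, presumably what ConstructSliceParams4Value extracts
  int64_t ld = 0;      // distance between consecutive sequence positions of one head
  int64_t stride = 0;  // distance between the matrices of consecutive (batch, head) pairs
};

inline PackedQkvOffsets ComputePackedQkvOffsets(int64_t batch, int64_t hidden, int64_t head_size) {
  PackedQkvOffsets off;
  off.q = 0;
  off.k = head_size;
  off.v = 2 * head_size;
  off.ld = batch * hidden;     // the `ld` passed to BatchedGemm above
  off.stride = 3 * head_size;  // the `stride` passed to BatchedGemm above
  return off;
}
// With this addressing, q * k^T for every (batch, head) pair becomes one strided batched GEMM over
// batch_size * num_heads problems, which is what the 'N', 'T' BatchedGemm call above expresses.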
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -template -struct TrilScaleLoad { - TrilScaleLoad(const SRC* src, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, SRC fill, - SRC scale) - : src(src), - tril_num_rows(tril_num_rows), - row_size(row_size), - diagonal(diagonal), - fill(fill), - scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) { - int64_t tril_row = row % tril_num_rows; - int64_t diagonal_col_id = tril_row + diagonal; - bool need_load = (col <= diagonal_col_id); - cuda::softmax::Pack pack; - if (need_load) { - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - } -#pragma unroll - for (int i = 0; i < N; ++i) { - if (col + i > diagonal_col_id) { - dst[i] = static_cast(fill); - } else { - dst[i] = static_cast(pack.elem[i]) * static_cast(scale); - } - } - } - const SRC* src; - int64_t tril_num_rows; - int64_t row_size; - int64_t diagonal; - SRC fill; - SRC scale; -}; - -template -struct MaskAndScaleStore { - MaskAndScaleStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) - : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack softmax_y_pack; - cuda::softmax::Pack dst_pack; - const int64_t offset = (row * row_size + col) / N; - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - softmax_y_pack.elem[i] = static_cast(src[i]); - dst_pack.elem[i] = - static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); - } - *(reinterpret_cast*>(softmax_y) + offset) = - softmax_y_pack.storage; - *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; - } - DST* dst; - DST* softmax_y; - const bool* mask; - int64_t row_size; - DST scale; -}; - -template -struct MaskAndScaleLoad { - MaskAndScaleLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) - : src(src), mask(mask), row_size(row_size), scale(scale) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - pack.storage = *(reinterpret_cast*>(src) + offset); - cuda::softmax::Pack mask_pack; - mask_pack.storage = *(reinterpret_cast*>(mask) + offset); -#pragma unroll - for (int i = 0; i < N; ++i) { - dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) - * static_cast(scale); - } - } - const SRC* src; - const bool* mask; - int64_t row_size; - SRC scale; -}; - -template -struct TrilScaleStore { - TrilScaleStore(DST* dst, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, DST fill, - DST scale) - : dst(dst), - tril_num_rows(tril_num_rows), - row_size(row_size), - diagonal(diagonal), - fill(fill), - scale(scale) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::softmax::Pack pack; - const int64_t offset = (row * row_size + col) / N; - int64_t tril_row = row % tril_num_rows; -#pragma unroll - for (int i = 0; i < N; ++i) { - if (col + i > tril_row + diagonal) { - pack.elem[i] = fill; - } else { - pack.elem[i] = static_cast(src[i]) * static_cast(scale); - } - } - *(reinterpret_cast*>(dst) + offset) = pack.storage; - } - DST* dst; - int64_t tril_num_rows; - int64_t row_size; - int64_t diagonal; - DST 
fill; - DST scale; -}; - -template -class FusedTrilScaleSoftmaxMaskScaleKernel final : public user_op::OpKernel { - public: - FusedTrilScaleSoftmaxMaskScaleKernel() = default; - ~FusedTrilScaleSoftmaxMaskScaleKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const ShapeView& x_shape = x->shape_view(); - CHECK_GE(x_shape.NumAxes(), 2); - const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); - const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); - const int64_t tril_num_rows = x_shape.At(x_shape.NumAxes() - 2); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - TrilScaleLoad load( - x->dptr(), tril_num_rows, cols, ctx->Attr("diagonal"), - ctx->Attr("tril_fill_value"), ctx->Attr("tril_scale_value")); - MaskAndScaleStore store(y->mut_dptr(), softmax_y->mut_dptr(), - mask->dptr(), cols, - ctx->Attr("mask_scale_value")); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( - ctx->stream()->As()->cuda_stream(), load, store, rows, cols))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(half) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(float) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(double) -#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL - -template -class FusedTrilScaleSoftmaxMaskScaleGradKernel final : public user_op::OpKernel { - public: - FusedTrilScaleSoftmaxMaskScaleGradKernel() = default; - ~FusedTrilScaleSoftmaxMaskScaleGradKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const ShapeView& dy_shape = dy->shape_view(); - CHECK_GE(dy_shape.NumAxes(), 2); - const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); - const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); - const int64_t tril_num_rows = dy_shape.At(dy_shape.NumAxes() - 2); - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); - MaskAndScaleLoad load_dy(dy->dptr(), mask->dptr(), cols, - ctx->Attr("mask_scale_value")); - TrilScaleStore store(dx->mut_dptr(), tril_num_rows, cols, - ctx->Attr("diagonal"), static_cast(0.0), - ctx->Attr("tril_scale_value")); - OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( - ctx->stream()->As()->cuda_stream(), load_softmax_y, load_dy, store, rows, - cols))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define 
REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(half) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(float) -REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(double) -#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +template +struct TrilScaleLoad { + TrilScaleLoad(const SRC* src, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, SRC fill, + SRC scale) + : src(src), + tril_num_rows(tril_num_rows), + row_size(row_size), + diagonal(diagonal), + fill(fill), + scale(scale) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) { + int64_t tril_row = row % tril_num_rows; + int64_t diagonal_col_id = tril_row + diagonal; + bool need_load = (col <= diagonal_col_id); + cuda::softmax::Pack pack; + if (need_load) { + const int64_t offset = (row * row_size + col) / N; + pack.storage = *(reinterpret_cast*>(src) + offset); + } +#pragma unroll + for (int i = 0; i < N; ++i) { + if (col + i > diagonal_col_id) { + dst[i] = static_cast(fill); + } else { + dst[i] = static_cast(pack.elem[i]) * static_cast(scale); + } + } + } + const SRC* src; + int64_t tril_num_rows; + int64_t row_size; + int64_t diagonal; + SRC fill; + SRC scale; +}; + +template +struct MaskAndScaleStore { + MaskAndScaleStore(DST* dst, DST* softmax_y, const bool* mask, int64_t row_size, DST scale) + : dst(dst), softmax_y(softmax_y), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack softmax_y_pack; + cuda::softmax::Pack dst_pack; + const int64_t offset = (row * row_size + col) / N; + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + softmax_y_pack.elem[i] = static_cast(src[i]); + dst_pack.elem[i] = + static_cast(src[i]) * static_cast(mask_pack.elem[i]) * static_cast(scale); + } + *(reinterpret_cast*>(softmax_y) + offset) = + softmax_y_pack.storage; + *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; + } + DST* dst; + DST* softmax_y; + const bool* mask; + int64_t row_size; + DST scale; +}; + +template +struct MaskAndScaleLoad { + MaskAndScaleLoad(const SRC* src, const bool* mask, int64_t row_size, SRC scale) + : src(src), mask(mask), row_size(row_size), scale(scale) {} + template + __device__ void load(DST* dst, int64_t row, int64_t col) const { + cuda::softmax::Pack pack; + const int64_t offset = (row * row_size + col) / N; + pack.storage = 
*(reinterpret_cast*>(src) + offset); + cuda::softmax::Pack mask_pack; + mask_pack.storage = *(reinterpret_cast*>(mask) + offset); +#pragma unroll + for (int i = 0; i < N; ++i) { + dst[i] = static_cast(pack.elem[i]) * static_cast(mask_pack.elem[i]) + * static_cast(scale); + } + } + const SRC* src; + const bool* mask; + int64_t row_size; + SRC scale; +}; + +template +struct TrilScaleStore { + TrilScaleStore(DST* dst, int64_t tril_num_rows, int64_t row_size, int64_t diagonal, DST fill, + DST scale) + : dst(dst), + tril_num_rows(tril_num_rows), + row_size(row_size), + diagonal(diagonal), + fill(fill), + scale(scale) {} + template + __device__ void store(const SRC* src, int64_t row, int64_t col) { + cuda::softmax::Pack pack; + const int64_t offset = (row * row_size + col) / N; + int64_t tril_row = row % tril_num_rows; +#pragma unroll + for (int i = 0; i < N; ++i) { + if (col + i > tril_row + diagonal) { + pack.elem[i] = fill; + } else { + pack.elem[i] = static_cast(src[i]) * static_cast(scale); + } + } + *(reinterpret_cast*>(dst) + offset) = pack.storage; + } + DST* dst; + int64_t tril_num_rows; + int64_t row_size; + int64_t diagonal; + DST fill; + DST scale; +}; + +template +class FusedTrilScaleSoftmaxMaskScaleKernel final : public user_op::OpKernel { + public: + FusedTrilScaleSoftmaxMaskScaleKernel() = default; + ~FusedTrilScaleSoftmaxMaskScaleKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* softmax_y = ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const ShapeView& x_shape = x->shape_view(); + CHECK_GE(x_shape.NumAxes(), 2); + const int64_t cols = x_shape.At(x_shape.NumAxes() - 1); + const int64_t rows = x_shape.Count(0, x_shape.NumAxes() - 1); + const int64_t tril_num_rows = x_shape.At(x_shape.NumAxes() - 2); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + TrilScaleLoad load( + x->dptr(), tril_num_rows, cols, ctx->Attr("diagonal"), + ctx->Attr("tril_fill_value"), ctx->Attr("tril_scale_value")); + MaskAndScaleStore store(y->mut_dptr(), softmax_y->mut_dptr(), + mask->dptr(), cols, + ctx->Attr("mask_scale_value")); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax( + ctx->stream()->As()->cuda_stream(), load, store, rows, cols))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(half) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(float) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL(double) +#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_CUDA_KERNEL + +template +class FusedTrilScaleSoftmaxMaskScaleGradKernel final : public user_op::OpKernel { + public: + FusedTrilScaleSoftmaxMaskScaleGradKernel() = default; + ~FusedTrilScaleSoftmaxMaskScaleGradKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* softmax_y = 
ctx->Tensor4ArgNameAndIndex("softmax_y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* mask = ctx->Tensor4ArgNameAndIndex("mask", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const ShapeView& dy_shape = dy->shape_view(); + CHECK_GE(dy_shape.NumAxes(), 2); + const int64_t cols = dy_shape.At(dy_shape.NumAxes() - 1); + const int64_t rows = dy_shape.Count(0, dy_shape.NumAxes() - 1); + const int64_t tril_num_rows = dy_shape.At(dy_shape.NumAxes() - 2); + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load_softmax_y(softmax_y->dptr(), cols); + MaskAndScaleLoad load_dy(dy->dptr(), mask->dptr(), cols, + ctx->Attr("mask_scale_value")); + TrilScaleStore store(dx->mut_dptr(), tril_num_rows, cols, + ctx->Attr("diagonal"), static_cast(0.0), + ctx->Attr("tril_scale_value")); + OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad( + ctx->stream()->As()->cuda_stream(), load_softmax_y, load_dy, store, rows, + cols))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_tril_scale_softmax_mask_scale_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(half) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(float) +REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL(double) +#undef REGISTER_FUSED_TRIL_SCALE_SOFTMAX_MASK_SCALE_GRAD_KERNEL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/gather_kernel_util.hip.cpp b/oneflow/user/kernels/gather_kernel_util.hip.cpp index 675a617..c783961 100644 --- a/oneflow/user/kernels/gather_kernel_util.hip.cpp +++ b/oneflow/user/kernels/gather_kernel_util.hip.cpp @@ -1,123 +1,123 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
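// Single-row CPU reference of the fused computation above (hypothetical helper, for checking the
// semantics only): TrilScaleLoad applies the triangular mask and scale while loading,
// DispatchSoftmax computes the row softmax, and MaskAndScaleStore applies the boolean mask and a
// second scale while storing. The same math written out plainly:
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<float> TrilScaleSoftmaxMaskScaleRow(const std::vector<float>& x,
                                                const std::vector<bool>& mask, int64_t row,
                                                int64_t diagonal, float fill, float tril_scale,
                                                float mask_scale) {
  const int64_t cols = static_cast<int64_t>(x.size());
  std::vector<float> t(cols);
  // 1) triangular mask + scale, as in TrilScaleLoad: keep col <= row + diagonal, else use fill.
  for (int64_t c = 0; c < cols; ++c) { t[c] = (c <= row + diagonal) ? x[c] * tril_scale : fill; }
  // 2) numerically stable softmax over the row (the part DispatchSoftmax fuses on the device).
  const float max_v = *std::max_element(t.begin(), t.end());
  float sum = 0.f;
  for (float& v : t) {
    v = std::exp(v - max_v);
    sum += v;
  }
  // 3) boolean mask + scale, as in MaskAndScaleStore; softmax_y would be t[c] / sum before masking.
  for (int64_t c = 0; c < cols; ++c) { t[c] = (t[c] / sum) * (mask[c] ? 1.f : 0.f) * mask_scale; }
  return t;
}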
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/gather_kernel_util.h" -#include "oneflow/core/kernel/kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include - -namespace oneflow { - -namespace { - -template -__global__ void GatherForwardGpu(const IDX elem_cnt, NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, const K* indices, - const T* in, const IDX gather_dim_size, T* out, const IDX offset) { - IDX index[3]; - CUDA_1D_KERNEL_LOOP_T(IDX, i, elem_cnt) { - out_helper.OffsetToNdIndex(i, index); - index[1] = indices[index[1]] - offset; - T v{}; - if (index[1] >= 0 && index[1] < gather_dim_size) { v = in[in_helper.NdIndexToOffset(index)]; } - out[i] = v; - } -} - -bool IsSafeUseIndex32(int64_t outer_dim_size, int64_t gather_dim_size, int64_t inner_dim_size, - int64_t num_indices) { - const int64_t in_elem_cnt = outer_dim_size * gather_dim_size * inner_dim_size; - const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; - return std::max(out_elem_cnt, in_elem_cnt) < GetMaxVal() / 2; -} - -template -void DispatchIndexSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const T* in, T* out) { - const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; - if (IsSafeUseIndex32(outer_dim_size, gather_dim_size, inner_dim_size, num_indices)) { - NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); - NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); - GatherForwardGpu<<As()->cuda_stream()>>>( - out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); - } else { - NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); - NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); - GatherForwardGpu<<As()->cuda_stream()>>>( - out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); - } -} - -template -bool TryDispatchMovementType(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const void* in, void* out) { - if (reinterpret_cast(in) % sizeof(T) == 0 - && reinterpret_cast(out) % sizeof(T) == 0 && inner_dim_size % sizeof(T) == 0) { - DispatchIndexSize(stream, outer_dim_size, gather_dim_size, inner_dim_size / sizeof(T), - num_indices, offset, indices, static_cast(in), - static_cast(out)); - return true; - } else { - return false; - } -} - -template -void DispatchMovementSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const void* in, void* out) { - using Func = bool (*)(ep::Stream * stream, int64_t outer_dim_size, int64_t gather_dim_size, - int64_t inner_dim_size, int64_t num_indices, int64_t offset, - const K* indices, const void* in, void* out); - Func funcs[] = { - TryDispatchMovementType, // 16B - TryDispatchMovementType, // 8B - TryDispatchMovementType, // 4B - TryDispatchMovementType, // 2B - TryDispatchMovementType, // 1B - }; - for (size_t i = 0; i < sizeof(funcs) / sizeof(funcs[0]); ++i) { - if (funcs[i](stream, outer_dim_size, gather_dim_size, inner_dim_size, num_indices, offset, - indices, in, out)) { - break; - } - } -} - -} // namespace - -template -struct GatherKernelUtilImpl final { - static void 
Forward(ep::Stream* stream, const K* indices, int64_t num_indices, const T* in, - const Shape& flat_in_shape, T* out, const int64_t offset) { - DispatchMovementSize(stream, flat_in_shape.At(0), flat_in_shape.At(1), - flat_in_shape.At(2) * sizeof(T), num_indices, offset, indices, in, out); - } -}; - -#define INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ - template struct GatherKernelUtilImpl; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL, - GATHER_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, GATHER_INDEX_TYPE_SEQ); -#undef INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/gather_kernel_util.h" +#include "oneflow/core/kernel/kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include + +namespace oneflow { + +namespace { + +template +__global__ void GatherForwardGpu(const IDX elem_cnt, NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, const K* indices, + const T* in, const IDX gather_dim_size, T* out, const IDX offset) { + IDX index[3]; + CUDA_1D_KERNEL_LOOP_T(IDX, i, elem_cnt) { + out_helper.OffsetToNdIndex(i, index); + index[1] = indices[index[1]] - offset; + T v{}; + if (index[1] >= 0 && index[1] < gather_dim_size) { v = in[in_helper.NdIndexToOffset(index)]; } + out[i] = v; + } +} + +bool IsSafeUseIndex32(int64_t outer_dim_size, int64_t gather_dim_size, int64_t inner_dim_size, + int64_t num_indices) { + const int64_t in_elem_cnt = outer_dim_size * gather_dim_size * inner_dim_size; + const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; + return std::max(out_elem_cnt, in_elem_cnt) < GetMaxVal() / 2; +} + +template +void DispatchIndexSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const T* in, T* out) { + const int64_t out_elem_cnt = outer_dim_size * num_indices * inner_dim_size; + if (IsSafeUseIndex32(outer_dim_size, gather_dim_size, inner_dim_size, num_indices)) { + NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); + NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); + GatherForwardGpu<<As()->cuda_stream()>>>( + out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); + } else { + NdIndexOffsetHelper in_helper(outer_dim_size, gather_dim_size, inner_dim_size); + NdIndexOffsetHelper out_helper(outer_dim_size, num_indices, inner_dim_size); + GatherForwardGpu<<As()->cuda_stream()>>>( + out_elem_cnt, in_helper, out_helper, indices, in, gather_dim_size, out, offset); + } +} + +template +bool TryDispatchMovementType(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const void* in, void* out) { + if (reinterpret_cast(in) % sizeof(T) 
== 0 + && reinterpret_cast(out) % sizeof(T) == 0 && inner_dim_size % sizeof(T) == 0) { + DispatchIndexSize(stream, outer_dim_size, gather_dim_size, inner_dim_size / sizeof(T), + num_indices, offset, indices, static_cast(in), + static_cast(out)); + return true; + } else { + return false; + } +} + +template +void DispatchMovementSize(ep::Stream* stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const void* in, void* out) { + using Func = bool (*)(ep::Stream * stream, int64_t outer_dim_size, int64_t gather_dim_size, + int64_t inner_dim_size, int64_t num_indices, int64_t offset, + const K* indices, const void* in, void* out); + Func funcs[] = { + TryDispatchMovementType, // 16B + TryDispatchMovementType, // 8B + TryDispatchMovementType, // 4B + TryDispatchMovementType, // 2B + TryDispatchMovementType, // 1B + }; + for (size_t i = 0; i < sizeof(funcs) / sizeof(funcs[0]); ++i) { + if (funcs[i](stream, outer_dim_size, gather_dim_size, inner_dim_size, num_indices, offset, + indices, in, out)) { + break; + } + } +} + +} // namespace + +template +struct GatherKernelUtilImpl final { + static void Forward(ep::Stream* stream, const K* indices, int64_t num_indices, const T* in, + const Shape& flat_in_shape, T* out, const int64_t offset) { + DispatchMovementSize(stream, flat_in_shape.At(0), flat_in_shape.At(1), + flat_in_shape.At(2) * sizeof(T), num_indices, offset, indices, in, out); + } +}; + +#define INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL(in_type_pair, index_type_pair) \ + template struct GatherKernelUtilImpl; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL, + GATHER_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, GATHER_INDEX_TYPE_SEQ); +#undef INITIATE_GATHER_KERNEL_UTIL_CUDA_IMPL + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp index 3b75fe2..2306d8c 100644 --- a/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp +++ b/oneflow/user/kernels/generate_random_batch_permutation_indices_kernel.hip.cpp @@ -1,138 +1,138 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
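// The movement-type dispatch in the gather kernel above can be summarized by a tiny host-side
// helper (hypothetical name): pick the widest of 16/8/4/2/1 bytes such that both pointers and the
// inner dimension's byte count are divisible by it, then copy each inner slice at that width.
#include <cstddef>
#include <cstdint>

inline size_t SelectMovementBytes(const void* in, const void* out, size_t inner_bytes) {
  for (size_t width : {size_t{16}, size_t{8}, size_t{4}, size_t{2}, size_t{1}}) {
    const bool in_aligned = reinterpret_cast<uintptr_t>(in) % width == 0;
    const bool out_aligned = reinterpret_cast<uintptr_t>(out) % width == 0;
    if (in_aligned && out_aligned && inner_bytes % width == 0) { return width; }
  }
  return 1;  // unreachable: a width of 1 byte always matches
}
// For example, float rows of 24 bytes with 8-byte-aligned pointers dispatch at 8 bytes, which is
// why DispatchIndexSize above receives inner_dim_size / sizeof(T) elements of the movement type.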
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/random_generator.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/user/kernels/op_kernel_wrapper.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(const int32_t& batch_size, const int32_t& capacity, void* ptr) - : capacity_{capacity}, - random_value_elem_cnt_{batch_size}, - sorted_value_elem_cnt_{batch_size}, - indices_elem_cnt_{batch_size} { - const int32_t random_value_aligned_bytes = - GetCudaAlignedSize(random_value_elem_cnt_ * sizeof(float)); - const int32_t sorted_value_aligned_bytes = - GetCudaAlignedSize(sorted_value_elem_cnt_ * sizeof(float)); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); - random_value_ptr_ = reinterpret_cast(ptr); - sorted_value_ptr_ = reinterpret_cast(reinterpret_cast(random_value_ptr_) - + random_value_aligned_bytes); - indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_value_ptr_) - + sorted_value_aligned_bytes); - temp_storage_ptr_ = - reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); - temp_storage_bytes_ = - capacity_ - random_value_aligned_bytes - sorted_value_aligned_bytes - indices_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - float* RandomValuePtr() const { return random_value_ptr_; } - float* SortedValuePtr() const { return sorted_value_ptr_; } - int32_t* IndicesPtr() const { return indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int32_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int32_t capacity_; - - float* random_value_ptr_; - float* sorted_value_ptr_; - int32_t* indices_ptr_; - void* temp_storage_ptr_; - - int32_t random_value_elem_cnt_; - int32_t sorted_value_elem_cnt_; - int32_t indices_elem_cnt_; - int32_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i; }; -} - -} // namespace - -class GenerateRandomBatchPermutationIndicesGPUKernel final : public user_op::OpKernel { - public: - GenerateRandomBatchPermutationIndicesGPUKernel() = default; - ~GenerateRandomBatchPermutationIndicesGPUKernel() = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - int64_t seed = ctx->Attr("seed"); - return std::make_shared>>( - seed, ctx->stream()); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* random_generator = - dynamic_cast>*>(state); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t batch_size = y->shape_view().At(0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(batch_size, - static_cast(tmp_buffer->shape_view().elem_cnt()), - tmp_buffer->mut_dptr()); - random_generator->Mutable()->Uniform(batch_size, buf_manager.RandomValuePtr()); - InitializeIndices<<stream()->As()->cuda_stream()>>>( - batch_size, buf_manager.IndicesPtr()); - const int32_t argsort_instance_num = 1; - const int32_t argsort_instance_size = batch_size; - 
SortPairsAscending(buf_manager.RandomValuePtr(), buf_manager.IndicesPtr(), argsort_instance_num, - argsort_instance_size, buf_manager.TempStoragePtr(), - buf_manager.TempStorageBytes(), buf_manager.SortedValuePtr(), - y->mut_dptr(), ctx->stream()->As()->cuda_stream()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("generate_random_batch_permutation_indices") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { - const Shape* y_shape = ctx->OutputShape("y", 0); - const int32_t batch_size = y_shape->At(0); - - const int32_t random_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); - const int32_t sorted_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(int32_t)); - const int32_t argsort_instance_num = 1; - const int32_t argsort_instance_size = batch_size; - const int32_t temp_storage_bytes = InferTempStorageForSortPairsAscending( - argsort_instance_num, argsort_instance_size); - - return random_value_aligned_bytes + sorted_value_aligned_bytes + indices_aligned_bytes - + temp_storage_bytes; - }); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
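// Host-side analogue of the permutation trick above (hypothetical helper): draw one uniform key
// per batch element, fill indices 0..batch_size-1, and sort the (key, index) pairs by key; the
// reordered index column is a uniformly random permutation, which is what SortPairsAscending
// writes into y on the device.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

std::vector<int32_t> RandomPermutationBySortingKeys(int32_t batch_size, uint64_t seed) {
  std::mt19937_64 rng(seed);
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  std::vector<float> keys(batch_size);
  std::vector<int32_t> indices(batch_size);
  for (float& key : keys) { key = uniform(rng); }
  std::iota(indices.begin(), indices.end(), 0);  // mirrors the InitializeIndices kernel
  std::sort(indices.begin(), indices.end(),
            [&](int32_t a, int32_t b) { return keys[a] < keys[b]; });  // mirrors SortPairsAscending
  return indices;  // ties between keys have negligible probability
}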
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/random_generator.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/user/kernels/op_kernel_wrapper.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(const int32_t& batch_size, const int32_t& capacity, void* ptr) + : capacity_{capacity}, + random_value_elem_cnt_{batch_size}, + sorted_value_elem_cnt_{batch_size}, + indices_elem_cnt_{batch_size} { + const int32_t random_value_aligned_bytes = + GetCudaAlignedSize(random_value_elem_cnt_ * sizeof(float)); + const int32_t sorted_value_aligned_bytes = + GetCudaAlignedSize(sorted_value_elem_cnt_ * sizeof(float)); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t)); + random_value_ptr_ = reinterpret_cast(ptr); + sorted_value_ptr_ = reinterpret_cast(reinterpret_cast(random_value_ptr_) + + random_value_aligned_bytes); + indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_value_ptr_) + + sorted_value_aligned_bytes); + temp_storage_ptr_ = + reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); + temp_storage_bytes_ = + capacity_ - random_value_aligned_bytes - sorted_value_aligned_bytes - indices_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + float* RandomValuePtr() const { return random_value_ptr_; } + float* SortedValuePtr() const { return sorted_value_ptr_; } + int32_t* IndicesPtr() const { return indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int32_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int32_t capacity_; + + float* random_value_ptr_; + float* sorted_value_ptr_; + int32_t* indices_ptr_; + void* temp_storage_ptr_; + + int32_t random_value_elem_cnt_; + int32_t sorted_value_elem_cnt_; + int32_t indices_elem_cnt_; + int32_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i; }; +} + +} // namespace + +class GenerateRandomBatchPermutationIndicesGPUKernel final : public user_op::OpKernel { + public: + GenerateRandomBatchPermutationIndicesGPUKernel() = default; + ~GenerateRandomBatchPermutationIndicesGPUKernel() = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + int64_t seed = ctx->Attr("seed"); + return std::make_shared>>( + seed, ctx->stream()); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* random_generator = + dynamic_cast>*>(state); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t batch_size = y->shape_view().At(0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + TmpBufferManager buf_manager(batch_size, + static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr()); + random_generator->Mutable()->Uniform(batch_size, buf_manager.RandomValuePtr()); + InitializeIndices<<stream()->As()->cuda_stream()>>>( + batch_size, buf_manager.IndicesPtr()); + const int32_t argsort_instance_num = 1; + const int32_t argsort_instance_size = batch_size; + 
SortPairsAscending(buf_manager.RandomValuePtr(), buf_manager.IndicesPtr(), argsort_instance_num, + argsort_instance_size, buf_manager.TempStoragePtr(), + buf_manager.TempStorageBytes(), buf_manager.SortedValuePtr(), + y->mut_dptr(), ctx->stream()->As()->cuda_stream()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("generate_random_batch_permutation_indices") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { + const Shape* y_shape = ctx->OutputShape("y", 0); + const int32_t batch_size = y_shape->At(0); + + const int32_t random_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); + const int32_t sorted_value_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(float)); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(batch_size * sizeof(int32_t)); + const int32_t argsort_instance_num = 1; + const int32_t argsort_instance_size = batch_size; + const int32_t temp_storage_bytes = InferTempStorageForSortPairsAscending( + argsort_instance_num, argsort_instance_size); + + return random_value_aligned_bytes + sorted_value_aligned_bytes + indices_aligned_bytes + + temp_storage_bytes; + }); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp b/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp index 0b59386..3fbc1aa 100644 --- a/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp +++ b/oneflow/user/kernels/heap_selection_top_k_kernel.hip.cpp @@ -1,233 +1,233 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -T PowOf2Floor(T val, int64_t max_power) { - CHECK_GT(val, GetZeroVal()); - T max_floor = static_cast(std::pow(2, max_power)); - val = std::min(val, max_floor); - T ret = GetOneVal(); - while (true) { - ret *= 2; - if (ret >= val) { return ret == val ? 
ret : ret / 2; } - } -} - -template -T PowOf2Ceil(T val, int64_t max_power) { - CHECK_GT(val, GetZeroVal()); - T max_ceil = static_cast(std::pow(2, max_power)); - val = std::min(val, max_ceil); - T ret = GetOneVal(); - while (true) { - ret *= 2; - if (ret >= val) { return ret; } - } -} - -template -__device__ void BitonicSwap(T* data, const int64_t i, const int64_t j, const bool dir, - const Compare& comp) { - if (comp(data[i], data[j]) == dir) { - T tmp = data[i]; - data[i] = data[j]; - data[j] = tmp; - } -} - -// https://en.wikipedia.org/wiki/Bitonic_sorter -template -__device__ void BitonicSort(T* data, const int64_t elem_cnt, const Compare& comp) { - // The element count of instance should be pow-of-2 - assert(elem_cnt > 0 && !(elem_cnt & (elem_cnt - 1))); - - // Generate a bitonic sequence from input - for (int64_t size = 2; size <= elem_cnt / 2; size *= 2) { - // Merge 2 bitonic sequences of length 'size' into a bitonic sequence of length '2 * size' - for (int64_t stride = size / 2; stride > 0; stride /= 2) { - for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { - // Change dir at intervals of 'size / 2' swaps - const bool dir = swap_id & (size / 2); - // Locate the pair {pos, pos + stride} which is going te be swaped if needed - const int pos = 2 * swap_id - (swap_id & (stride - 1)); - - BitonicSwap(data, pos, pos + stride, dir, comp); - - __syncthreads(); - } - } - } - - // Sort the bitonic sequence - for (int64_t stride = elem_cnt / 2; stride > 0; stride /= 2) { - for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { - // Locate the pair {pos, pos + stride} which is going te be swaped if needed - const int pos = 2 * swap_id - (swap_id & (stride - 1)); - - BitonicSwap(data, pos, pos + stride, false, comp); - - __syncthreads(); - } - } -} - -template -class Entry final { - public: - __device__ __forceinline__ Entry(int64_t index, T value) : index_(index), value_(value) {} - - __device__ __forceinline__ int64_t GetIndex() const { return index_; } - __device__ __forceinline__ T GetValue() const { return value_; } - __device__ __forceinline__ void SetIndex(int64_t index) { index_ = index; } - __device__ __forceinline__ void SetValue(T value) { value_ = value; } - - __device__ __forceinline__ bool operator<(const Entry& entry) const { - return (value_ < entry.GetValue()) || (value_ == entry.GetValue() && index_ > entry.GetIndex()); - } - __device__ __forceinline__ bool operator>(const Entry& entry) const { - return (value_ > entry.GetValue()) || (value_ == entry.GetValue() && index_ < entry.GetIndex()); - } - - private: - int64_t index_; - T value_; -}; - -template -class MinHeap final { - public: - __device__ __forceinline__ MinHeap(Entry* data, const int64_t heap_size, - const int64_t init_index, const T init_value) - : data_(data), heap_size_(heap_size) { - for (int64_t i = 0; i < heap_size; ++i) { - data_[i].SetIndex(init_index); - data_[i].SetValue(init_value); - } - } - __device__ __forceinline__ Entry& Top() { return data_[0]; } - __device__ __forceinline__ void Swap(const int64_t i, const int64_t j) { - auto tmp = data_[j]; - data_[j] = data_[i]; - data_[i] = tmp; - } - __device__ __forceinline__ void MinHeapify(int64_t index) { - while (true) { - const int64_t left = 2 * index + 1; - const int64_t right = 2 * index + 2; - int64_t min = index; - if (left < heap_size_ && data_[left] < data_[min]) { min = left; } - if (right < heap_size_ && data_[right] < data_[min]) { min = right; } - if (min == index) { return; } - 
Swap(min, index); - index = min; - } - } - - private: - Entry* data_; - int64_t heap_size_; -}; - -template -__global__ void HeapTopKKernel(const T* in_ptr, const int64_t instance_num, - const int64_t instance_size, const int64_t k, - const int64_t heap_size, const int64_t init_index, - const T init_value, int64_t* out_ptr) { - extern __shared__ char smem[]; - auto* shared_entries = reinterpret_cast*>(smem); - - // Divide elements to be sorted into disjoint sets (# of sets == # of heaps). - // Each thread in the thread block manipulates one heap to select top heap_size entries from - // corresponding set - const T* input = in_ptr + blockIdx.x * instance_size; - auto heap = - MinHeap(shared_entries + threadIdx.x * heap_size, heap_size, init_index, init_value); - for (int64_t i = threadIdx.x; i < instance_size; i += blockDim.x) { - auto entry = Entry(i, input[i]); - if (entry > heap.Top()) { - heap.Top() = entry; - heap.MinHeapify(0); - } - } - - __syncthreads(); - - // Merge all heaps into a unified, sorted array - BitonicSort(shared_entries, blockDim.x * heap_size, - [](const Entry& x, const Entry& y) { return x > y; }); - - // Write top_k elements in sorted array to output - for (int64_t i = threadIdx.x; i < k; i += blockDim.x) { - (out_ptr + blockIdx.x * k)[i] = shared_entries[i].GetIndex(); - } -} - -} // namespace - -template -class GpuHeapSelectionTopKKernel final : public user_op::OpKernel { - public: - GpuHeapSelectionTopKKernel() = default; - ~GpuHeapSelectionTopKKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape_view().elem_cnt() == 0) { return; } - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int64_t instance_num = in->shape_view().elem_cnt() / instance_size; - const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); - - // Use as many heaps as possible (# of heaps == # of threads used in thread block). - // Limitation 1: size of shared memory - // We also need heap_size * num_heap to be pow-of-2 which is necessary for bitonic sort - const int64_t heap_size = PowOf2Ceil(k, 16); - int32_t num_heap = - PowOf2Floor(kCudaMaxSharedMemoryByteSize / (heap_size * sizeof(Entry)), 16); - // Limitation 2: # of threads in thread block - num_heap = std::min(num_heap, kCudaThreadsNumPerBlock); - - HeapTopKKernel<<), - ctx->stream()->As()->cuda_stream()>>>( - in->dptr(), instance_num, instance_size, k, heap_size, GetMaxVal(), - GetMinVal(), out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(dtype) \ - REGISTER_USER_KERNEL("top_k").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) && (user_op::HobAttr("k") <= 128) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)); - -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(float) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(double) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(uint8_t) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int8_t) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int32_t) -REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +T PowOf2Floor(T val, int64_t max_power) { + CHECK_GT(val, GetZeroVal()); + T max_floor = static_cast(std::pow(2, max_power)); + val = std::min(val, max_floor); + T ret = GetOneVal(); + while (true) { + ret *= 2; + if (ret >= val) { return ret == val ? ret : ret / 2; } + } +} + +template +T PowOf2Ceil(T val, int64_t max_power) { + CHECK_GT(val, GetZeroVal()); + T max_ceil = static_cast(std::pow(2, max_power)); + val = std::min(val, max_ceil); + T ret = GetOneVal(); + while (true) { + ret *= 2; + if (ret >= val) { return ret; } + } +} + +template +__device__ void BitonicSwap(T* data, const int64_t i, const int64_t j, const bool dir, + const Compare& comp) { + if (comp(data[i], data[j]) == dir) { + T tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } +} + +// https://en.wikipedia.org/wiki/Bitonic_sorter +template +__device__ void BitonicSort(T* data, const int64_t elem_cnt, const Compare& comp) { + // The element count of instance should be pow-of-2 + assert(elem_cnt > 0 && !(elem_cnt & (elem_cnt - 1))); + + // Generate a bitonic sequence from input + for (int64_t size = 2; size <= elem_cnt / 2; size *= 2) { + // Merge 2 bitonic sequences of length 'size' into a bitonic sequence of length '2 * size' + for (int64_t stride = size / 2; stride > 0; stride /= 2) { + for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { + // Change dir at intervals of 'size / 2' swaps + const bool dir = swap_id & (size / 2); + // Locate the pair {pos, pos + stride} which is going te be swaped if needed + const int pos = 2 * swap_id - (swap_id & (stride - 1)); + + BitonicSwap(data, pos, pos + stride, dir, comp); + + __syncthreads(); + } + } + } + + // Sort the bitonic sequence + for (int64_t stride = elem_cnt / 2; stride > 0; stride /= 2) { + for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; swap_id += blockDim.x) { + // Locate the pair {pos, pos + stride} which is going te be swaped if needed + const int pos = 2 * swap_id - (swap_id & (stride - 1)); + + BitonicSwap(data, pos, pos + stride, false, comp); + + __syncthreads(); + } + } +} + +template +class Entry final { + public: + __device__ __forceinline__ Entry(int64_t index, T value) : index_(index), value_(value) {} + + __device__ __forceinline__ int64_t GetIndex() const { return index_; } + __device__ __forceinline__ T GetValue() const { return value_; } + __device__ __forceinline__ void SetIndex(int64_t index) { index_ = index; } + __device__ __forceinline__ void SetValue(T value) { value_ = value; } + + __device__ __forceinline__ bool operator<(const Entry& entry) const { + return (value_ < entry.GetValue()) || (value_ == entry.GetValue() && index_ > entry.GetIndex()); + } + __device__ __forceinline__ bool operator>(const Entry& entry) const { + return (value_ > entry.GetValue()) || (value_ == entry.GetValue() && 
index_ < entry.GetIndex()); + } + + private: + int64_t index_; + T value_; +}; + +template +class MinHeap final { + public: + __device__ __forceinline__ MinHeap(Entry* data, const int64_t heap_size, + const int64_t init_index, const T init_value) + : data_(data), heap_size_(heap_size) { + for (int64_t i = 0; i < heap_size; ++i) { + data_[i].SetIndex(init_index); + data_[i].SetValue(init_value); + } + } + __device__ __forceinline__ Entry& Top() { return data_[0]; } + __device__ __forceinline__ void Swap(const int64_t i, const int64_t j) { + auto tmp = data_[j]; + data_[j] = data_[i]; + data_[i] = tmp; + } + __device__ __forceinline__ void MinHeapify(int64_t index) { + while (true) { + const int64_t left = 2 * index + 1; + const int64_t right = 2 * index + 2; + int64_t min = index; + if (left < heap_size_ && data_[left] < data_[min]) { min = left; } + if (right < heap_size_ && data_[right] < data_[min]) { min = right; } + if (min == index) { return; } + Swap(min, index); + index = min; + } + } + + private: + Entry* data_; + int64_t heap_size_; +}; + +template +__global__ void HeapTopKKernel(const T* in_ptr, const int64_t instance_num, + const int64_t instance_size, const int64_t k, + const int64_t heap_size, const int64_t init_index, + const T init_value, int64_t* out_ptr) { + extern __shared__ char smem[]; + auto* shared_entries = reinterpret_cast*>(smem); + + // Divide elements to be sorted into disjoint sets (# of sets == # of heaps). + // Each thread in the thread block manipulates one heap to select top heap_size entries from + // corresponding set + const T* input = in_ptr + blockIdx.x * instance_size; + auto heap = + MinHeap(shared_entries + threadIdx.x * heap_size, heap_size, init_index, init_value); + for (int64_t i = threadIdx.x; i < instance_size; i += blockDim.x) { + auto entry = Entry(i, input[i]); + if (entry > heap.Top()) { + heap.Top() = entry; + heap.MinHeapify(0); + } + } + + __syncthreads(); + + // Merge all heaps into a unified, sorted array + BitonicSort(shared_entries, blockDim.x * heap_size, + [](const Entry& x, const Entry& y) { return x > y; }); + + // Write top_k elements in sorted array to output + for (int64_t i = threadIdx.x; i < k; i += blockDim.x) { + (out_ptr + blockIdx.x * k)[i] = shared_entries[i].GetIndex(); + } +} + +} // namespace + +template +class GpuHeapSelectionTopKKernel final : public user_op::OpKernel { + public: + GpuHeapSelectionTopKKernel() = default; + ~GpuHeapSelectionTopKKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + if (in->shape_view().elem_cnt() == 0) { return; } + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = in->shape_view().elem_cnt() / instance_size; + const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); + + // Use as many heaps as possible (# of heaps == # of threads used in thread block). 
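  // For a concrete sense of scale (assuming Entry<float> occupies 16 bytes, an 8-byte index plus a
  // value padded to 8, and, say, 48 KB of shared memory per block): with k = 32 the heap size is
  // PowOf2Ceil(32, 16) = 32, so 48 * 1024 / (32 * 16) = 96 heaps fit, and PowOf2Floor(96, 16)
  // rounds that down to 64 heaps; the block-size cap below may shrink it further.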
+ // Limitation 1: size of shared memory + // We also need heap_size * num_heap to be pow-of-2 which is necessary for bitonic sort + const int64_t heap_size = PowOf2Ceil(k, 16); + int32_t num_heap = + PowOf2Floor(kCudaMaxSharedMemoryByteSize / (heap_size * sizeof(Entry)), 16); + // Limitation 2: # of threads in thread block + num_heap = std::min(num_heap, kCudaThreadsNumPerBlock); + + HeapTopKKernel<<), + ctx->stream()->As()->cuda_stream()>>>( + in->dptr(), instance_num, instance_size, k, heap_size, GetMaxVal(), + GetMinVal(), out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(dtype) \ + REGISTER_USER_KERNEL("top_k").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) && (user_op::HobAttr("k") <= 128) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)); + +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(float) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(double) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(uint8_t) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int8_t) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int32_t) +REGISTER_CUDA_HEAP_SELECTION_TOP_K_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/image_preprocess_kernels.hip.cpp b/oneflow/user/kernels/image_preprocess_kernels.hip.cpp index 26a961e..49d0bef 100644 --- a/oneflow/user/kernels/image_preprocess_kernels.hip.cpp +++ b/oneflow/user/kernels/image_preprocess_kernels.hip.cpp @@ -1,216 +1,216 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
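// CPU sketch of the per-thread selection step in the top_k kernel above (hypothetical helper):
// keep a fixed-size min-heap whose root is the smallest retained entry and replace the root
// whenever a larger value arrives; the GPU kernel additionally merges all per-thread heaps with a
// bitonic sort in shared memory, a step this host version does not need.
#include <cstdint>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

std::vector<int64_t> TopKIndicesByMinHeap(const std::vector<float>& data, int64_t k) {
  using HeapEntry = std::pair<float, int64_t>;  // (value, original index)
  // std::greater turns priority_queue into a min-heap, so top() plays the role of MinHeap::Top().
  std::priority_queue<HeapEntry, std::vector<HeapEntry>, std::greater<HeapEntry>> heap;
  for (int64_t i = 0; i < static_cast<int64_t>(data.size()); ++i) {
    if (static_cast<int64_t>(heap.size()) < k) {
      heap.emplace(data[i], i);
    } else if (data[i] > heap.top().first) {
      heap.pop();
      heap.emplace(data[i], i);
    }
  }
  std::vector<int64_t> indices;
  while (!heap.empty()) {
    indices.push_back(heap.top().second);
    heap.pop();
  }
  return {indices.rbegin(), indices.rend()};  // largest value first, as top_k returns it
}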
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/common/small_vector.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -struct NormalizeVal { - float val[3]; -}; - -enum TensorLayout { - kNCHW = 0, - kNHWC = 1, -}; - -class NormalizeAttr final : public user_op::OpKernelState { - public: - NormalizeAttr(user_op::KernelInitContext* ctx) { - const std::vector& mean_vec = ctx->Attr>("mean"); - if (mean_vec.size() == 1) { - for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(0); } - } else if (mean_vec.size() == 3) { - for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(i); } - } else { - UNIMPLEMENTED(); - } - - const std::vector& std_vec = ctx->Attr>("std"); - if (std_vec.size() == 1) { - for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(0); } - } else if (std_vec.size() == 3) { - for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(i); } - } else { - UNIMPLEMENTED(); - } - } - ~NormalizeAttr() = default; - - const NormalizeVal& mean() const { return mean_; } - const NormalizeVal& inv_std() const { return inv_std_; } - - private: - NormalizeVal mean_; - NormalizeVal inv_std_; -}; - -template -__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, - const int8_t* mirror_dptr, int32_t out_W, - int32_t H_offset, int32_t W_offset); -template<> -__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, - const int8_t* mirror_dptr, - int32_t out_W, int32_t H_offset, - int32_t W_offset) { - if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[3] = out_W - 1 - out_idx[3]; } - in_idx[0] = out_idx[0]; // N - in_idx[1] = out_idx[2] + H_offset; // H - in_idx[2] = out_idx[3] + W_offset; // W - in_idx[3] = out_idx[1]; // C -} - -template<> -__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, - const int8_t* mirror_dptr, - int32_t out_W, int32_t H_offset, - int32_t W_offset) { - if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[2] = out_W - 1 - out_idx[2]; } - in_idx[0] = out_idx[0]; // N - in_idx[1] = out_idx[1] + H_offset; // H - in_idx[2] = out_idx[2] + W_offset; // W - in_idx[3] = out_idx[3]; // C -} - -template -__global__ void CropMirrorNormalizeGpuImpl(int32_t elem_cnt, const uint8_t* in_dptr, - float* out_dptr, const int8_t* mirror_dptr, - int32_t out_W, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, - int32_t H_offset, int32_t W_offset, - const NormalizeVal mean, const NormalizeVal inv_std) { - CUDA_1D_KERNEL_LOOP(out_offset, elem_cnt) { - int32_t in_idx[4]; - int32_t out_idx[4]; - out_helper.OffsetToNdIndex(out_offset, out_idx); - OutIdx2InIdx(out_idx, in_idx, mirror_dptr, out_W, H_offset, W_offset); - float mean_val; - float inv_std_val; - const int32_t c = in_idx[3]; - // When the compiler can't resolve array indices to constants it will put private arrays into - // GPU local memory. Using local memory is slower than keeping array elements directly in - // registers. 
- if (c == 0) { - mean_val = mean.val[0]; - inv_std_val = inv_std.val[0]; - } else if (c == 1) { - mean_val = mean.val[1]; - inv_std_val = inv_std.val[1]; - } else if (c == 2) { - mean_val = mean.val[2]; - inv_std_val = inv_std.val[2]; - } else { - // undefined behavior - assert(false); - } - int32_t in_offset = in_helper.NdIndexToOffset(in_idx); - out_dptr[out_offset] = (static_cast(in_dptr[in_offset]) - mean_val) * inv_std_val; - } -} - -} // namespace - -class CropMirrorNormalizeGpuKernel final : public user_op::OpKernel { - public: - CropMirrorNormalizeGpuKernel() = default; - ~CropMirrorNormalizeGpuKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* normalize_attr = dynamic_cast(state); - const NormalizeVal& mean = normalize_attr->mean(); - const NormalizeVal& inv_std = normalize_attr->inv_std(); - user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - const std::string& output_layout = ctx->Attr("output_layout"); - float* out_dptr = out_blob->mut_dptr(); - const uint8_t* in_dptr = in_blob->dptr(); - const ShapeView& in_shape = in_blob->shape_view(); - const ShapeView& out_shape = out_blob->shape_view(); - CHECK_EQ(in_shape.NumAxes(), 4); - CHECK_EQ(out_shape.NumAxes(), 4); - int32_t elem_cnt = out_shape.elem_cnt(); - CHECK_LE(elem_cnt, GetMaxVal()); - float crop_pos_y = ctx->Attr("crop_pos_y"); - float crop_pos_x = ctx->Attr("crop_pos_x"); - - int32_t N = in_shape.At(0); - int32_t in_H = in_shape.At(1); - int32_t in_W = in_shape.At(2); - int32_t C = in_shape.At(3); - const NdIndexOffsetHelper in_helper(N, in_H, in_W, C); - const int8_t* mirror_dptr = nullptr; - user_op::Tensor* mirror_blob = ctx->Tensor4ArgNameAndIndex("mirror", 0); - if (mirror_blob) { mirror_dptr = mirror_blob->dptr(); } - - if (output_layout == "NCHW") { - CHECK_EQ(N, out_shape.At(0)); - CHECK_EQ(C, out_shape.At(1)); - int32_t out_H = out_shape.At(2); - int32_t out_W = out_shape.At(3); - CHECK_LE(out_H, in_H); - CHECK_LE(out_W, in_W); - int32_t H_offset = (in_H - out_H) * crop_pos_y; - int32_t W_offset = (in_W - out_W) * crop_pos_x; - const NdIndexOffsetHelper out_helper(N, C, out_H, out_W); - CropMirrorNormalizeGpuImpl - <<stream()->As()->cuda_stream()>>>( - elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, - W_offset, mean, inv_std); - } else if (output_layout == "NHWC") { - CHECK_EQ(N, out_shape.At(0)); - int32_t out_H = out_shape.At(1); - int32_t out_W = out_shape.At(2); - CHECK_EQ(C, out_shape.At(3)); - CHECK_LE(out_H, in_H); - CHECK_LE(out_W, in_W); - int32_t H_offset = (in_H - out_H) * crop_pos_y; - int32_t W_offset = (in_W - out_W) * crop_pos_x; - const NdIndexOffsetHelper out_helper(N, out_H, out_W, C); - CropMirrorNormalizeGpuImpl - <<stream()->As()->cuda_stream()>>>( - elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, - W_offset, mean, inv_std); - } else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("crop_mirror_normalize_from_uint8") - .SetCreateFn() - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) - && (user_op::HobDataType("in", 0) == DataType::kUInt8) - && 
(user_op::HobDataType("out", 0) == DataType::kFloat)); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/common/small_vector.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +struct NormalizeVal { + float val[3]; +}; + +enum TensorLayout { + kNCHW = 0, + kNHWC = 1, +}; + +class NormalizeAttr final : public user_op::OpKernelState { + public: + NormalizeAttr(user_op::KernelInitContext* ctx) { + const std::vector& mean_vec = ctx->Attr>("mean"); + if (mean_vec.size() == 1) { + for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(0); } + } else if (mean_vec.size() == 3) { + for (int i = 0; i < 3; ++i) { mean_.val[i] = mean_vec.at(i); } + } else { + UNIMPLEMENTED(); + } + + const std::vector& std_vec = ctx->Attr>("std"); + if (std_vec.size() == 1) { + for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(0); } + } else if (std_vec.size() == 3) { + for (int i = 0; i < 3; ++i) { inv_std_.val[i] = 1.0f / std_vec.at(i); } + } else { + UNIMPLEMENTED(); + } + } + ~NormalizeAttr() = default; + + const NormalizeVal& mean() const { return mean_; } + const NormalizeVal& inv_std() const { return inv_std_; } + + private: + NormalizeVal mean_; + NormalizeVal inv_std_; +}; + +template +__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, + const int8_t* mirror_dptr, int32_t out_W, + int32_t H_offset, int32_t W_offset); +template<> +__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, + const int8_t* mirror_dptr, + int32_t out_W, int32_t H_offset, + int32_t W_offset) { + if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[3] = out_W - 1 - out_idx[3]; } + in_idx[0] = out_idx[0]; // N + in_idx[1] = out_idx[2] + H_offset; // H + in_idx[2] = out_idx[3] + W_offset; // W + in_idx[3] = out_idx[1]; // C +} + +template<> +__device__ __forceinline__ void OutIdx2InIdx(int32_t* out_idx, int32_t* in_idx, + const int8_t* mirror_dptr, + int32_t out_W, int32_t H_offset, + int32_t W_offset) { + if (mirror_dptr && mirror_dptr[out_idx[0]]) { out_idx[2] = out_W - 1 - out_idx[2]; } + in_idx[0] = out_idx[0]; // N + in_idx[1] = out_idx[1] + H_offset; // H + in_idx[2] = out_idx[2] + W_offset; // W + in_idx[3] = out_idx[3]; // C +} + +template +__global__ void CropMirrorNormalizeGpuImpl(int32_t elem_cnt, const uint8_t* in_dptr, + float* out_dptr, const int8_t* mirror_dptr, + int32_t out_W, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, + int32_t H_offset, int32_t W_offset, + const NormalizeVal mean, const NormalizeVal inv_std) { + CUDA_1D_KERNEL_LOOP(out_offset, elem_cnt) { + int32_t in_idx[4]; + int32_t out_idx[4]; + out_helper.OffsetToNdIndex(out_offset, out_idx); + OutIdx2InIdx(out_idx, in_idx, mirror_dptr, out_W, H_offset, W_offset); + float mean_val; + float inv_std_val; + const int32_t c = 
in_idx[3]; + // When the compiler can't resolve array indices to constants it will put private arrays into + // GPU local memory. Using local memory is slower than keeping array elements directly in + // registers. + if (c == 0) { + mean_val = mean.val[0]; + inv_std_val = inv_std.val[0]; + } else if (c == 1) { + mean_val = mean.val[1]; + inv_std_val = inv_std.val[1]; + } else if (c == 2) { + mean_val = mean.val[2]; + inv_std_val = inv_std.val[2]; + } else { + // undefined behavior + assert(false); + } + int32_t in_offset = in_helper.NdIndexToOffset(in_idx); + out_dptr[out_offset] = (static_cast(in_dptr[in_offset]) - mean_val) * inv_std_val; + } +} + +} // namespace + +class CropMirrorNormalizeGpuKernel final : public user_op::OpKernel { + public: + CropMirrorNormalizeGpuKernel() = default; + ~CropMirrorNormalizeGpuKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* normalize_attr = dynamic_cast(state); + const NormalizeVal& mean = normalize_attr->mean(); + const NormalizeVal& inv_std = normalize_attr->inv_std(); + user_op::Tensor* in_blob = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::string& output_layout = ctx->Attr("output_layout"); + float* out_dptr = out_blob->mut_dptr(); + const uint8_t* in_dptr = in_blob->dptr(); + const ShapeView& in_shape = in_blob->shape_view(); + const ShapeView& out_shape = out_blob->shape_view(); + CHECK_EQ(in_shape.NumAxes(), 4); + CHECK_EQ(out_shape.NumAxes(), 4); + int32_t elem_cnt = out_shape.elem_cnt(); + CHECK_LE(elem_cnt, GetMaxVal()); + float crop_pos_y = ctx->Attr("crop_pos_y"); + float crop_pos_x = ctx->Attr("crop_pos_x"); + + int32_t N = in_shape.At(0); + int32_t in_H = in_shape.At(1); + int32_t in_W = in_shape.At(2); + int32_t C = in_shape.At(3); + const NdIndexOffsetHelper in_helper(N, in_H, in_W, C); + const int8_t* mirror_dptr = nullptr; + user_op::Tensor* mirror_blob = ctx->Tensor4ArgNameAndIndex("mirror", 0); + if (mirror_blob) { mirror_dptr = mirror_blob->dptr(); } + + if (output_layout == "NCHW") { + CHECK_EQ(N, out_shape.At(0)); + CHECK_EQ(C, out_shape.At(1)); + int32_t out_H = out_shape.At(2); + int32_t out_W = out_shape.At(3); + CHECK_LE(out_H, in_H); + CHECK_LE(out_W, in_W); + int32_t H_offset = (in_H - out_H) * crop_pos_y; + int32_t W_offset = (in_W - out_W) * crop_pos_x; + const NdIndexOffsetHelper out_helper(N, C, out_H, out_W); + CropMirrorNormalizeGpuImpl + <<stream()->As()->cuda_stream()>>>( + elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, + W_offset, mean, inv_std); + } else if (output_layout == "NHWC") { + CHECK_EQ(N, out_shape.At(0)); + int32_t out_H = out_shape.At(1); + int32_t out_W = out_shape.At(2); + CHECK_EQ(C, out_shape.At(3)); + CHECK_LE(out_H, in_H); + CHECK_LE(out_W, in_W); + int32_t H_offset = (in_H - out_H) * crop_pos_y; + int32_t W_offset = (in_W - out_W) * crop_pos_x; + const NdIndexOffsetHelper out_helper(N, out_H, out_W, C); + CropMirrorNormalizeGpuImpl + <<stream()->As()->cuda_stream()>>>( + elem_cnt, in_dptr, out_dptr, mirror_dptr, out_W, in_helper, out_helper, H_offset, + W_offset, mean, inv_std); + } else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; 
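Per output element, the kernel above only does index arithmetic plus one fused multiply: flip the output column if the sample is mirrored, shift by the crop offsets, map into the NHWC input layout, then apply (x - mean) * inv_std. A standalone C++ sketch of that mapping for the NCHW-output branch (names are illustrative, not OneFlow API):

    #include <cstdint>

    // One output element of crop_mirror_normalize: NHWC uint8 input, NCHW float output.
    // H_offset/W_offset are (in_H - out_H) * crop_pos_y and (in_W - out_W) * crop_pos_x.
    float CropMirrorNormalizeOne(const uint8_t* in, int32_t in_H, int32_t in_W, int32_t C,
                                 int32_t n, int32_t c, int32_t h, int32_t w, int32_t out_W,
                                 int32_t H_offset, int32_t W_offset, bool mirror,
                                 float mean, float inv_std) {
      if (mirror) { w = out_W - 1 - w; }   // horizontal flip within the output row
      const int32_t in_h = h + H_offset;   // top edge of the crop window
      const int32_t in_w = w + W_offset;   // left edge of the crop window
      const int64_t in_offset =
          ((static_cast<int64_t>(n) * in_H + in_h) * in_W + in_w) * C + c;  // NHWC layout
      return (static_cast<float>(in[in_offset]) - mean) * inv_std;
    }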
+ +REGISTER_USER_KERNEL("crop_mirror_normalize_from_uint8") + .SetCreateFn() + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) + && (user_op::HobDataType("in", 0) == DataType::kUInt8) + && (user_op::HobDataType("out", 0) == DataType::kFloat)); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp b/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp index b971dc1..e6be6f7 100644 --- a/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp +++ b/oneflow/user/kernels/in_top_k_kernel_util.hip.cpp @@ -1,68 +1,68 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/in_top_k_kernel_util.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void InTopkGpu(const int instance_num, const int classes_num, const T* targets, - const float* predictions, const int k, bool* out) { - CUDA_1D_KERNEL_LOOP(idx, instance_num) { - T target = targets[idx]; - bool cannot_say = (target >= classes_num) || !isfinite(predictions[idx * classes_num + target]); - - int32_t more_probable_classes = 0; - if (!cannot_say) { - const float target_prediction = predictions[idx * classes_num + target]; - FOR_RANGE(int32_t, class_idx, 0, classes_num) { - float pred = predictions[idx * classes_num + class_idx]; - - if (!isfinite(pred)) { - cannot_say = true; - break; - } else if (pred > target_prediction) { - ++more_probable_classes; - if (more_probable_classes > k) break; - } - } - } - out[idx] = cannot_say ? false : (more_probable_classes < k); - } -} - -} // namespace - -template -struct InTopkKernelUtil { - static void InTopk(ep::Stream* stream, const int instance_num, const int classes_num, - const T* targets, const float* predictions, const int k, bool* out) { - RUN_CUDA_KERNEL((InTopkGpu), stream, instance_num, instance_num, classes_num, targets, - predictions, k, out); - } -}; - -#define INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA(cpp_data_type, data_type) \ - template struct InTopkKernelUtil; - -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ) - -#undef INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/in_top_k_kernel_util.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void InTopkGpu(const int instance_num, const int classes_num, const T* targets, + const float* predictions, const int k, bool* out) { + CUDA_1D_KERNEL_LOOP(idx, instance_num) { + T target = targets[idx]; + bool cannot_say = (target >= classes_num) || !isfinite(predictions[idx * classes_num + target]); + + int32_t more_probable_classes = 0; + if (!cannot_say) { + const float target_prediction = predictions[idx * classes_num + target]; + FOR_RANGE(int32_t, class_idx, 0, classes_num) { + float pred = predictions[idx * classes_num + class_idx]; + + if (!isfinite(pred)) { + cannot_say = true; + break; + } else if (pred > target_prediction) { + ++more_probable_classes; + if (more_probable_classes > k) break; + } + } + } + out[idx] = cannot_say ? false : (more_probable_classes < k); + } +} + +} // namespace + +template +struct InTopkKernelUtil { + static void InTopk(ep::Stream* stream, const int instance_num, const int classes_num, + const T* targets, const float* predictions, const int k, bool* out) { + RUN_CUDA_KERNEL((InTopkGpu), stream, instance_num, instance_num, classes_num, targets, + predictions, k, out); + } +}; + +#define INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA(cpp_data_type, data_type) \ + template struct InTopkKernelUtil; + +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ) + +#undef INSTANTIATE_IN_TOP_K_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/kl_div_kernel.hip.cpp b/oneflow/user/kernels/kl_div_kernel.hip.cpp index cc78fa5..eddebd8 100644 --- a/oneflow/user/kernels/kl_div_kernel.hip.cpp +++ b/oneflow/user/kernels/kl_div_kernel.hip.cpp @@ -1,121 +1,121 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -using namespace loss; - -template -struct KLDivFunctor { - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - if (LOG_TARGET) { - return exp(target_val) * (target_val - input_val); - } else { - const T zero_val = static_cast(0); - const T out_val = target_val * (SafeLog(target_val) - input_val); - return target_val > zero_val ? 
out_val : zero_val; - } - } -}; - -template -struct KLDivFunctor { - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - if (LOG_TARGET) { - return hexp(target_val) * (target_val - input_val); - } else { - const half zero_val = __float2half(0.f); - const half out_val = target_val * (SafeLog(target_val) - input_val); - return target_val > zero_val ? out_val : zero_val; - } - } -}; - -template -struct KLDivGradFunctor { - __device__ __forceinline__ T operator()(T target_val, T dy_val) const { - if (LOG_TARGET) { - return -exp(target_val) * dy_val; - } else { - const T zero_val = static_cast(0); - return target_val > zero_val ? -target_val * dy_val : zero_val; - } - } -}; - -template -struct KLDivGradFunctor { - __device__ __forceinline__ half operator()(half target_val, half dy_val) const { - if (LOG_TARGET) { - return __hneg(hexp(target_val) * dy_val); - } else { - const half zero_val = __float2half(0.f); - return target_val > zero_val ? __hneg(target_val * dy_val) : zero_val; - } - } -}; - -template -class KLDivKernel : public SimpleLossKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, T* out) const { - const bool log_target = ctx->Attr("log_target"); - if (log_target) { - OF_CUDA_CHECK( - (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK( - (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } - } -}; - -template -class KLDivGradKernel : public SimpleLossGradKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, const T* dy, T* dx) const { - const bool log_target = ctx->Attr("log_target"); - if (log_target) { - OF_CUDA_CHECK((cuda::elementwise::Binary( - KLDivGradFunctor(), elem_cnt, dx, target, dy, - ctx->stream()->As()->cuda_stream()))); - } else { - OF_CUDA_CHECK((cuda::elementwise::Binary( - KLDivGradFunctor(), elem_cnt, dx, target, dy, - ctx->stream()->As()->cuda_stream()))); - } - } -}; - -} // namespace - -REGISTER_SIMPLE_LOSS_KERNEL_CUDA("kl_div_loss", KLDivKernel) -REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("kl_div_loss_grad", KLDivGradKernel) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
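The KL-divergence functors above follow the usual convention in which `input` is already a log-probability, and `log_target` selects whether `target` is given in log space as well. A scalar C++ sketch of the per-element term, with SafeLog replaced by a plain guard and the function name purely illustrative:

    #include <cmath>

    // Element-wise KL divergence term; `input` is a log-probability.
    // When log_target is true, `target` is also a log-probability.
    double KlDivTerm(double input, double target, bool log_target) {
      if (log_target) {
        return std::exp(target) * (target - input);
      }
      // Plain-probability target: the term is defined to be 0 where target == 0.
      return target > 0.0 ? target * (std::log(target) - input) : 0.0;
    }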
+*/ +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +using namespace loss; + +template +struct KLDivFunctor { + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + if (LOG_TARGET) { + return exp(target_val) * (target_val - input_val); + } else { + const T zero_val = static_cast(0); + const T out_val = target_val * (SafeLog(target_val) - input_val); + return target_val > zero_val ? out_val : zero_val; + } + } +}; + +template +struct KLDivFunctor { + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + if (LOG_TARGET) { + return hexp(target_val) * (target_val - input_val); + } else { + const half zero_val = __float2half(0.f); + const half out_val = target_val * (SafeLog(target_val) - input_val); + return target_val > zero_val ? out_val : zero_val; + } + } +}; + +template +struct KLDivGradFunctor { + __device__ __forceinline__ T operator()(T target_val, T dy_val) const { + if (LOG_TARGET) { + return -exp(target_val) * dy_val; + } else { + const T zero_val = static_cast(0); + return target_val > zero_val ? -target_val * dy_val : zero_val; + } + } +}; + +template +struct KLDivGradFunctor { + __device__ __forceinline__ half operator()(half target_val, half dy_val) const { + if (LOG_TARGET) { + return __hneg(hexp(target_val) * dy_val); + } else { + const half zero_val = __float2half(0.f); + return target_val > zero_val ? __hneg(target_val * dy_val) : zero_val; + } + } +}; + +template +class KLDivKernel : public SimpleLossKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, T* out) const { + const bool log_target = ctx->Attr("log_target"); + if (log_target) { + OF_CUDA_CHECK( + (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK( + (cuda::elementwise::Binary(KLDivFunctor(), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } + } +}; + +template +class KLDivGradKernel : public SimpleLossGradKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, const T* dy, T* dx) const { + const bool log_target = ctx->Attr("log_target"); + if (log_target) { + OF_CUDA_CHECK((cuda::elementwise::Binary( + KLDivGradFunctor(), elem_cnt, dx, target, dy, + ctx->stream()->As()->cuda_stream()))); + } else { + OF_CUDA_CHECK((cuda::elementwise::Binary( + KLDivGradFunctor(), elem_cnt, dx, target, dy, + ctx->stream()->As()->cuda_stream()))); + } + } +}; + +} // namespace + +REGISTER_SIMPLE_LOSS_KERNEL_CUDA("kl_div_loss", KLDivKernel) +REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("kl_div_loss_grad", KLDivGradKernel) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp b/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp index 9672d97..27404bf 100644 --- a/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp +++ b/oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.hip.cpp @@ -1,51 +1,51 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void L1L2RegularizeGradientGpu(int64_t n, const T* model, const T* model_diff, T* out, - const T l1, const T l2) { - CUDA_1D_KERNEL_LOOP(i, n) { - const T model_val = model[i]; - out[i] = model_diff[i] + l1 * ((model_val >= 0) - (model_val <= 0)) + l2 * model_val; - } -} - -} // namespace - -template -struct L1L2RegularizeGradientKernelUtil { - static void RegularizeGradient(ep::Stream* stream, int64_t n, const T* model, const T* model_diff, - T* out, const T l1, const T l2) { - L1L2RegularizeGradientGpu<<As()->cuda_stream()>>>(n, model, model_diff, - out, l1, l2); - } -}; - -#define INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct L1L2RegularizeGradientKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ); -#undef INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
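The regularize-gradient kernel above adds an L1 and an L2 penalty gradient directly into the model diff: out = model_diff + l1 * sign(model) + l2 * model, with sign(0) treated as 0 (that is what the (x >= 0) - (x <= 0) expression evaluates to at zero). A plain C++ sketch of the same update (names illustrative):

    #include <cstddef>
    #include <vector>

    // out[i] = model_diff[i] + l1 * sign(model[i]) + l2 * model[i], sign(0) == 0.
    void L1L2RegularizeGradient(const std::vector<float>& model,
                                const std::vector<float>& model_diff,
                                float l1, float l2, std::vector<float>& out) {
      out.resize(model.size());
      for (size_t i = 0; i < model.size(); ++i) {
        const float w = model[i];
        const float sign = static_cast<float>(w > 0.0f) - static_cast<float>(w < 0.0f);
        out[i] = model_diff[i] + l1 * sign + l2 * w;
      }
    }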
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/l1_l2_regularize_gradient_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void L1L2RegularizeGradientGpu(int64_t n, const T* model, const T* model_diff, T* out, + const T l1, const T l2) { + CUDA_1D_KERNEL_LOOP(i, n) { + const T model_val = model[i]; + out[i] = model_diff[i] + l1 * ((model_val >= 0) - (model_val <= 0)) + l2 * model_val; + } +} + +} // namespace + +template +struct L1L2RegularizeGradientKernelUtil { + static void RegularizeGradient(ep::Stream* stream, int64_t n, const T* model, const T* model_diff, + T* out, const T l1, const T l2) { + L1L2RegularizeGradientGpu<<As()->cuda_stream()>>>(n, model, model_diff, + out, l1, l2); + } +}; + +#define INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct L1L2RegularizeGradientKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ); +#undef INSTANTIATE_L1_L2_REGULARIZE_GRADIENT_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/l2_normalize_kernel.hip.cpp b/oneflow/user/kernels/l2_normalize_kernel.hip.cpp index 5228003..f3ac7b2 100644 --- a/oneflow/user/kernels/l2_normalize_kernel.hip.cpp +++ b/oneflow/user/kernels/l2_normalize_kernel.hip.cpp @@ -1,150 +1,150 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void L2NormalizeForward(const int32_t n, const int32_t c, const int32_t d, - const T epsilon, const T* in, T* square_x_sum, T* out) { - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { - T sum = GetZeroVal(); - const int32_t offset = (i / d) * d * c + (i % d); - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const T x = in[offset + j * d]; - sum += x * x; - } - const T reduce_sum = BlockReduce(temp_storage).Sum(sum); - if (threadIdx.x == 0) { square_x_sum[i] = reduce_sum; } - __syncthreads(); - - const T inv_norm = rsqrtf(fmaxf(square_x_sum[i], epsilon)); - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - out[index] = inv_norm * in[index]; - } - } -} - -template -__global__ void L2NormalizeBackward(const int32_t n, const int32_t c, const int32_t d, - const float epsilon, const T* out, const T* out_diff, - const T* square_x_sum, T* in_diff) { - for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { - const T inv_norm = rsqrt(fmaxf(square_x_sum[i], epsilon)); - const int32_t offset = (i / d) * d * c + (i % d); - if (square_x_sum[i] >= epsilon) { - using BlockReduce = hipcub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage_prod_sum; - - T y_dy_prod_sum = GetZeroVal(); - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - y_dy_prod_sum += out[index] * out_diff[index]; - } - - const T reduce_y_dy_prod_sum = BlockReduce(temp_storage_prod_sum).Sum(y_dy_prod_sum); - __shared__ T y_dy_inner_prod; - if (threadIdx.x == 0) { y_dy_inner_prod = reduce_y_dy_prod_sum; } - __syncthreads(); - - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - in_diff[index] = inv_norm * (out_diff[index] - y_dy_inner_prod * out[index]); - } - } else { - for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { - const int32_t index = offset + j * d; - in_diff[index] = inv_norm * out_diff[index]; - } - } - } -} - -} // namespace - -template -class GpuL2NormalizeKernel final : public user_op::OpKernel { - public: - GpuL2NormalizeKernel() = default; - ~GpuL2NormalizeKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); - const float epsilon = ctx->Attr("epsilon"); - int32_t axis = ctx->Attr("axis"); - int32_t c = x->shape_view().At(axis); - int32_t n = x->shape_view().elem_cnt() / c; - int32_t d = x->shape_view().Count(axis + 1); - RUN_CUDA_KERNEL((L2NormalizeForward), ctx->stream(), n, n, c, d, static_cast(epsilon), - x->dptr(), square_x_sum->mut_dptr(), y->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_L2_NORMALIZE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("l2_normalize") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_L2_NORMALIZE_KERNEL(float) - -template -class 
GpuL2NormalizeGradKernel final : public user_op::OpKernel { - public: - GpuL2NormalizeGradKernel() = default; - ~GpuL2NormalizeGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const float epsilon = ctx->Attr("epsilon"); - int32_t axis = ctx->Attr("axis"); - int32_t c = dy->shape_view().At(axis); - int32_t n = dy->shape_view().elem_cnt() / c; - int32_t d = dy->shape_view().Count(axis + 1); - RUN_CUDA_KERNEL((L2NormalizeBackward), ctx->stream(), n, n, c, d, static_cast(epsilon), - y->dptr(), dy->dptr(), square_x_sum->dptr(), dx->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("l2_normalize_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(float) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
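In the forward kernel above, each block reduces one normalization slice to its squared sum, clamps it by epsilon, and scales the slice by the reciprocal square root; the squared sum is also written out because the backward pass reuses it. A host-side C++ sketch of one slice (function name illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // y = x / sqrt(max(sum(x * x), epsilon)); returns the squared sum for the backward pass.
    float L2NormalizeSlice(const std::vector<float>& x, float epsilon, std::vector<float>& y) {
      float square_x_sum = 0.0f;
      for (float v : x) { square_x_sum += v * v; }
      const float inv_norm = 1.0f / std::sqrt(std::max(square_x_sum, epsilon));
      y.resize(x.size());
      for (size_t i = 0; i < x.size(); ++i) { y[i] = x[i] * inv_norm; }
      return square_x_sum;
    }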
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void L2NormalizeForward(const int32_t n, const int32_t c, const int32_t d, + const T epsilon, const T* in, T* square_x_sum, T* out) { + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { + T sum = GetZeroVal(); + const int32_t offset = (i / d) * d * c + (i % d); + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const T x = in[offset + j * d]; + sum += x * x; + } + const T reduce_sum = BlockReduce(temp_storage).Sum(sum); + if (threadIdx.x == 0) { square_x_sum[i] = reduce_sum; } + __syncthreads(); + + const T inv_norm = rsqrtf(fmaxf(square_x_sum[i], epsilon)); + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + out[index] = inv_norm * in[index]; + } + } +} + +template +__global__ void L2NormalizeBackward(const int32_t n, const int32_t c, const int32_t d, + const float epsilon, const T* out, const T* out_diff, + const T* square_x_sum, T* in_diff) { + for (int32_t i = blockIdx.x; i < n; i += gridDim.x) { + const T inv_norm = rsqrt(fmaxf(square_x_sum[i], epsilon)); + const int32_t offset = (i / d) * d * c + (i % d); + if (square_x_sum[i] >= epsilon) { + using BlockReduce = hipcub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage_prod_sum; + + T y_dy_prod_sum = GetZeroVal(); + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + y_dy_prod_sum += out[index] * out_diff[index]; + } + + const T reduce_y_dy_prod_sum = BlockReduce(temp_storage_prod_sum).Sum(y_dy_prod_sum); + __shared__ T y_dy_inner_prod; + if (threadIdx.x == 0) { y_dy_inner_prod = reduce_y_dy_prod_sum; } + __syncthreads(); + + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + in_diff[index] = inv_norm * (out_diff[index] - y_dy_inner_prod * out[index]); + } + } else { + for (int32_t j = threadIdx.x; j < c; j += blockDim.x) { + const int32_t index = offset + j * d; + in_diff[index] = inv_norm * out_diff[index]; + } + } + } +} + +} // namespace + +template +class GpuL2NormalizeKernel final : public user_op::OpKernel { + public: + GpuL2NormalizeKernel() = default; + ~GpuL2NormalizeKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); + const float epsilon = ctx->Attr("epsilon"); + int32_t axis = ctx->Attr("axis"); + int32_t c = x->shape_view().At(axis); + int32_t n = x->shape_view().elem_cnt() / c; + int32_t d = x->shape_view().Count(axis + 1); + RUN_CUDA_KERNEL((L2NormalizeForward), ctx->stream(), n, n, c, d, static_cast(epsilon), + x->dptr(), square_x_sum->mut_dptr(), y->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_L2_NORMALIZE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("l2_normalize") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_L2_NORMALIZE_KERNEL(float) + +template +class 
GpuL2NormalizeGradKernel final : public user_op::OpKernel { + public: + GpuL2NormalizeGradKernel() = default; + ~GpuL2NormalizeGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* square_x_sum = ctx->Tensor4ArgNameAndIndex("square_x_sum", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const float epsilon = ctx->Attr("epsilon"); + int32_t axis = ctx->Attr("axis"); + int32_t c = dy->shape_view().At(axis); + int32_t n = dy->shape_view().elem_cnt() / c; + int32_t d = dy->shape_view().Count(axis + 1); + RUN_CUDA_KERNEL((L2NormalizeBackward), ctx->stream(), n, n, c, d, static_cast(epsilon), + y->dptr(), dy->dptr(), square_x_sum->dptr(), dx->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("l2_normalize_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_L2_NORMALIZE_GRAD_KERNEL(float) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp b/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp index 70da59a..d9f3285 100644 --- a/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp +++ b/oneflow/user/kernels/layer_norm_gpu_kernel.hip.cpp @@ -1,465 +1,678 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
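The backward kernel above distinguishes two cases: if the squared sum exceeds epsilon, the norm depends on x and the gradient picks up a projection term built from the inner product of y and dy; if it was clamped, the norm is constant and the gradient is just dy scaled by the same inverse norm. A C++ sketch of one slice under that reading (names illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // dx = inv_norm * (dy - <y, dy> * y) when square_x_sum >= epsilon,
    // dx = inv_norm * dy otherwise (clamped norm, constant w.r.t. x).
    void L2NormalizeGradSlice(const std::vector<float>& y, const std::vector<float>& dy,
                              float square_x_sum, float epsilon, std::vector<float>& dx) {
      const float inv_norm = 1.0f / std::sqrt(std::max(square_x_sum, epsilon));
      dx.resize(y.size());
      if (square_x_sum >= epsilon) {
        float y_dy = 0.0f;
        for (size_t i = 0; i < y.size(); ++i) { y_dy += y[i] * dy[i]; }
        for (size_t i = 0; i < y.size(); ++i) { dx[i] = inv_norm * (dy[i] - y_dy * y[i]); }
      } else {
        for (size_t i = 0; i < y.size(); ++i) { dx[i] = inv_norm * dy[i]; }
      }
    }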
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cudnn_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/ep/include/primitive/matmul.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/layer_norm.hip.h" - -namespace oneflow { - -namespace { - -template -struct AffineStore { - AffineStore(DST* y, int64_t row_size, const DST* gamma, const DST* beta) - : y(y), row_size(row_size), gamma(gamma), beta(beta) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::layer_norm::Pack y_pack; - cuda::layer_norm::Pack gamma_pack; - cuda::layer_norm::Pack beta_pack; - const int64_t offset = (row * row_size + col) / N; - const int64_t gamma_offset = col / N; - if (do_scale) { - gamma_pack.storage = - *(reinterpret_cast*>(gamma) + gamma_offset); - } else { -#pragma unroll - for (int i = 0; i < N; ++i) { gamma_pack.elem[i] = 1; } - } - if (do_center) { - beta_pack.storage = - *(reinterpret_cast*>(beta) + gamma_offset); - } else { -#pragma unroll - for (int i = 0; i < N; ++i) { beta_pack.elem[i] = 0; } - } -#pragma unroll - for (int i = 0; i < N; ++i) { - DST normalized_i = static_cast(src[i]); - if (do_scale || do_center) { - y_pack.elem[i] = normalized_i * gamma_pack.elem[i] + beta_pack.elem[i]; - } else { - y_pack.elem[i] = normalized_i; - } - } - *(reinterpret_cast*>(y) + offset) = y_pack.storage; - } - DST* y; - int64_t row_size; - const DST* gamma; - const DST* beta; -}; - -template -struct ScaleLoad { - ScaleLoad(const SRC* src, const SRC* gamma, int64_t row_size) - : src(src), gamma(gamma), row_size(row_size) {} - template - __device__ void load(DST* dst, int64_t row, int64_t col) const { - cuda::layer_norm::Pack src_pack; - cuda::layer_norm::Pack gamma_pack; - const int64_t offset = (row * row_size + col) / N; - const int64_t gamma_offset = col / N; - src_pack.storage = *(reinterpret_cast*>(src) + offset); - if (do_scale) { - gamma_pack.storage = - *(reinterpret_cast*>(gamma) + gamma_offset); - } else { -#pragma unroll - for (int i = 0; i < N; ++i) { gamma_pack.elem[i] = static_cast(1); } - } -#pragma unroll - for (int i = 0; i < N; ++i) { - dst[i] = static_cast(src_pack.elem[i] * gamma_pack.elem[i]); - } - } - const SRC* src; - const SRC* gamma; - int64_t row_size; -}; - -template -struct AddStore { - AddStore(const DST* add_to_output, DST* dst, int64_t row_size) - : add_to_output(add_to_output), dst(dst), row_size(row_size) {} - template - __device__ void store(const SRC* src, int64_t row, int64_t col) { - cuda::layer_norm::Pack add_to_output_pack; - cuda::layer_norm::Pack dst_pack; - const int64_t offset = (row * row_size + col) / N; - if (do_add) { - add_to_output_pack.storage = - *(reinterpret_cast*>(add_to_output) + offset); - } -#pragma unroll - for (int i = 0; i < N; ++i) { - if (do_add) { - dst_pack.elem[i] = static_cast(src[i]) + add_to_output_pack.elem[i]; - } else { - dst_pack.elem[i] = static_cast(src[i]); - } - } - *(reinterpret_cast*>(dst) + offset) = dst_pack.storage; - } - const DST* add_to_output; - DST* dst; - int64_t row_size; -}; - -template -__inline__ __device__ T WarpReduce(T val) { -// for (int mask = 16; mask > 0; mask /= 2) { val += __shfl_down_sync(0xffffffff, val, mask); } - for (int mask = 32; mask > 0; mask /= 2) { val += __shfl_down(val, mask, 64); } - return 
val; -} - -constexpr int tile_size = 32; -constexpr int num_per_block = 4; -constexpr int block_dim_x = 32; -constexpr int block_dim_y = 32 / num_per_block; - -template -__global__ void LayerNormParamGrad(int rows, int cols, const T* __restrict__ dy, - const T* __restrict__ x, const ComputeType* __restrict__ mean, - const ComputeType* __restrict__ inv_var, - T* __restrict__ tmp_gamma_diff, T* __restrict__ tmp_beta_diff) { - __shared__ ComputeType dgamma[32][33]; - __shared__ ComputeType dbeta[32][33]; - ComputeType dgamma_sum[num_per_block]; - ComputeType dbeta_sum[num_per_block]; -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - dgamma_sum[index] = 0; - dbeta_sum[index] = 0; - } - const int col_id = blockIdx.x * blockDim.x + threadIdx.x; - if (col_id < cols) { - for (int i = blockIdx.y * tile_size + threadIdx.y; i < rows; i += tile_size * gridDim.y) { -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - int row_id = i + index * blockDim.y; - if (row_id < rows) { - int offset = row_id * cols + col_id; - const ComputeType dy_val = static_cast(dy[offset]); - const ComputeType x_val = static_cast(x[offset]); - const ComputeType mean_val = mean[row_id]; - const ComputeType inv_var_val = inv_var[row_id]; - dgamma_sum[index] += dy_val * (x_val - mean_val) * inv_var_val; - dbeta_sum[index] += dy_val; - } - } - } - } -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - dgamma[index * blockDim.y + threadIdx.y][threadIdx.x] = dgamma_sum[index]; - dbeta[index * blockDim.y + threadIdx.y][threadIdx.x] = dbeta_sum[index]; - } - __syncthreads(); -#pragma unroll - for (int index = 0; index < num_per_block; ++index) { - const int col_id = blockIdx.x * blockDim.x + threadIdx.y + index * blockDim.y; - if (col_id < cols) { - ComputeType gamma_sum = dgamma[threadIdx.x][threadIdx.y + index * blockDim.y]; - ComputeType beta_sum = dbeta[threadIdx.x][threadIdx.y + index * blockDim.y]; - ComputeType global_dgamma = WarpReduce(gamma_sum); - ComputeType global_dbeta = WarpReduce(beta_sum); - if (threadIdx.x == 0) { - const int offset = blockIdx.y * cols + col_id; - tmp_gamma_diff[offset] = global_dgamma; - tmp_beta_diff[offset] = global_dbeta; - } - } - } -} - -template -int GetGirdDimY(const int64_t num_instances, const int64_t norm_size) { - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - const int grid_dim_x = (norm_size + tile_size - 1) / tile_size; - const int max_grid_dim_y = (num_instances + tile_size - 1) / tile_size; - const int block_size = block_dim_x * block_dim_y; - int max_active_blocks = 0; - OF_CUDA_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_blocks, LayerNormParamGrad, block_size, 0)); - int waves = 1; - int dev; - OF_CUDA_CHECK(hipGetDevice(&dev)); - int sm_count; - OF_CUDA_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev)); - int num_blocks = max_active_blocks * sm_count * waves; - int grid_dim_y = std::min(max_grid_dim_y, static_cast(num_blocks / grid_dim_x)); - return std::max(grid_dim_y, 1); -} - -template -void LayerNormForwardGpu(ep::Stream* stream, const int64_t num_instances, const int64_t norm_size, - const double epsilon, const T* x_ptr, const T* gamma_ptr, - const T* beta_ptr, T* y_ptr, user_op::Tensor* mean, - user_op::Tensor* inv_variance) { - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - cuda::layer_norm::DirectLoad load(x_ptr, norm_size); - AffineStore store(y_ptr, norm_size, gamma_ptr, beta_ptr); - 
cuda::layer_norm::DispatchLayerNorm( - stream->As()->cuda_stream(), load, store, num_instances, norm_size, epsilon, - mean->mut_dptr(), inv_variance->mut_dptr()); -} - -template -void DispatchLayerNormForwardGpu(ep::Stream* stream, const int64_t num_instances, - const int64_t norm_size, const double epsilon, const T* x_ptr, - const T* gamma_ptr, const T* beta_ptr, T* y_ptr, - user_op::Tensor* mean, user_op::Tensor* inv_variance) { - if (gamma_ptr != nullptr && beta_ptr != nullptr) { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr, - beta_ptr, y_ptr, mean, inv_variance); - } else if (gamma_ptr != nullptr && beta_ptr == nullptr) { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr, - beta_ptr, y_ptr, mean, inv_variance); - } else if (gamma_ptr == nullptr && beta_ptr != nullptr) { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, gamma_ptr, - beta_ptr, y_ptr, mean, inv_variance); - } else { - LayerNormForwardGpu(stream, num_instances, norm_size, epsilon, x_ptr, - gamma_ptr, beta_ptr, y_ptr, mean, inv_variance); - } -} - -template -void LayerNormBackwardGpu(ep::Stream* stream, const int64_t num_instances, const int64_t norm_size, - const T* dy_ptr, const T* x_ptr, const user_op::Tensor* mean, - const user_op::Tensor* inv_variance, const T* gamma_ptr, - const T* add_to_output_ptr, T* dx_ptr) { - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - cuda::layer_norm::DirectLoad load_x(x_ptr, norm_size); - ScaleLoad load_scaled_dy(dy_ptr, gamma_ptr, norm_size); - AddStore store(add_to_output_ptr, dx_ptr, norm_size); - OF_CUDA_CHECK((cuda::layer_norm::DispatchLayerNormGrad( - stream->As()->cuda_stream(), load_x, load_scaled_dy, store, - mean->dptr(), inv_variance->dptr(), num_instances, norm_size))); -} - -template -void DispatchLayerNormBackwardDoAdd(ep::Stream* stream, const int64_t num_instances, - const int64_t norm_size, const T* dy_ptr, const T* x_ptr, - const user_op::Tensor* mean, - const user_op::Tensor* inv_variance, const T* gamma_ptr, - const T* add_to_output_ptr, T* dx_ptr) { - if (add_to_output_ptr != nullptr) { - LayerNormBackwardGpu(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } else { - LayerNormBackwardGpu(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } -} - -template -void LaunchLayerNormBackward(ep::Stream* stream, const int64_t num_instances, - const int64_t norm_size, const T* dy_ptr, const T* x_ptr, - const user_op::Tensor* mean, const user_op::Tensor* inv_variance, - const T* gamma_ptr, const T* add_to_output_ptr, T* dx_ptr) { - if (gamma_ptr != nullptr) { - DispatchLayerNormBackwardDoAdd(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } else { - DispatchLayerNormBackwardDoAdd(stream, num_instances, norm_size, dy_ptr, x_ptr, mean, - inv_variance, gamma_ptr, add_to_output_ptr, dx_ptr); - } -} - -} // namespace - -template -class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - LayerNormGpuKernel() = default; - ~LayerNormGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y = 
ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - const double epsilon = ctx->Attr("epsilon"); - CHECK_GE(epsilon, HIPDNN_BN_MIN_EPSILON); - const int64_t num_instances = mean->shape_view().elem_cnt(); - const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; - const T* gamma_ptr = nullptr; - const T* beta_ptr = nullptr; - if (ctx->has_input("gamma", 0)) { - const user_op::Tensor* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - gamma_ptr = gamma->dptr(); - CHECK_EQ(gamma->shape_view().elem_cnt(), norm_size); - } - if (ctx->has_input("beta", 0)) { beta_ptr = ctx->Tensor4ArgNameAndIndex("beta", 0)->dptr(); } - DispatchLayerNormForwardGpu(ctx->stream(), num_instances, norm_size, epsilon, x->dptr(), - gamma_ptr, beta_ptr, y->mut_dptr(), mean, inv_variance); - }; -}; - -#define REGISTER_LAYER_NORM_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("layer_norm") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)); - -REGISTER_LAYER_NORM_CUDA_KERNEL(float) -REGISTER_LAYER_NORM_CUDA_KERNEL(double) -REGISTER_LAYER_NORM_CUDA_KERNEL(half) - -template -class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - LayerNormGradGpuKernel() = default; - ~LayerNormGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const int64_t num_instances = mean->shape_view().elem_cnt(); - const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; - const T* gamma_ptr = nullptr; - if (ctx->has_input("gamma", 0)) { - gamma_ptr = ctx->Tensor4ArgNameAndIndex("gamma", 0)->dptr(); - } - const T* add_to_output_ptr = nullptr; - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->data_type(), dx->data_type()); - CHECK_EQ(add_to_output->shape_view(), dx->shape_view()); - add_to_output_ptr = add_to_output->dptr(); - } - LaunchLayerNormBackward(ctx->stream(), num_instances, norm_size, dy->dptr(), x->dptr(), - mean, inv_variance, gamma_ptr, add_to_output_ptr, dx->mut_dptr()); - }; -}; - -#define REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("layer_norm_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value)) \ - .SetInplaceProposalFn( \ - [](const user_op::InferContext& ctx, \ - const user_op::AddInplaceArgPair& AddInplaceArgPairFn) -> Maybe { \ - if (ctx.has_input("_add_to_output", 0)) { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "_add_to_output", 0, true)); \ - } \ - return Maybe::Ok(); \ - }); - -REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(float) -REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(double) -REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(half) - -template -class LayerNormParamGradGpuKernel 
final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - LayerNormParamGradGpuKernel() = default; - ~LayerNormParamGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - const int64_t num_instances = mean->shape_view().elem_cnt(); - const int64_t norm_size = x->shape_view().elem_cnt() / num_instances; - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const DataType data_type = dy->data_type(); - const int grid_dim_x = (norm_size + tile_size - 1) / tile_size; - const int grid_dim_y = GetGirdDimY(num_instances, norm_size); - const size_t tmp_gamma_diff_size = grid_dim_y * norm_size * sizeof(T); - T* tmp_gamma_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr()); - T* tmp_beta_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr() + tmp_gamma_diff_size); - T* reduce_buf_ptr = - reinterpret_cast(tmp_buffer->mut_dptr() + 2 * tmp_gamma_diff_size); - using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; - LayerNormParamGrad<<stream()->As()->cuda_stream()>>>( - num_instances, norm_size, dy->dptr(), x->dptr(), mean->dptr(), - inv_variance->dptr(), tmp_gamma_diff_ptr, tmp_beta_diff_ptr); - const int32_t m = norm_size; - const int32_t n = 1; - const int32_t k = grid_dim_y; - std::unique_ptr fill = - ep::primitive::NewPrimitive(ctx->stream()->device_type(), - data_type); - CHECK(fill); - fill->Launch(ctx->stream(), reduce_buf_ptr, 1.0, grid_dim_y); - std::unique_ptr matmul = - ep::primitive::NewPrimitive( - ctx->stream()->device_type(), data_type, ep::primitive::BlasTransposeType::T, - ep::primitive::BlasTransposeType::N); - CHECK(matmul); - if (ctx->has_output("gamma_diff", 0)) { - user_op::Tensor* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0); - matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_gamma_diff_ptr, reduce_buf_ptr, 0.0, - gamma_diff->mut_dptr()); - } - if (ctx->has_output("beta_diff", 0)) { - user_op::Tensor* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0); - matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_beta_diff_ptr, reduce_buf_ptr, 0.0, - beta_diff->mut_dptr()); - } - }; -}; - -#define REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("layer_norm_param_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dy", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const int64_t begin_params_axis = ctx->Attr("begin_params_axis"); \ - const bool has_gamma_diff = ctx->has_output("gamma_diff", 0); \ - const bool has_beta_diff = ctx->has_output("beta_diff", 0); \ - const auto& dy = ctx->InputTensorDesc("dy", 0); \ - const int64_t num_instances = dy.shape().Count(0, begin_params_axis); \ - const int64_t norm_size = dy.shape().Count(begin_params_axis); \ - const int grid_dim_y = GetGirdDimY(num_instances, norm_size); \ - size_t tmp_buffer_size = (2 * grid_dim_y * norm_size + grid_dim_y) * sizeof(dtype); \ - return tmp_buffer_size; \ - }); - -REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(float) 
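The param-grad kernel above is a two-stage reduction: each of the grid_dim_y block rows writes a [grid_dim_y, norm_size] matrix of partial gamma/beta gradients into the temporary buffer (hence the (2 * grid_dim_y * norm_size + grid_dim_y) * sizeof(dtype) buffer size), and the second stage collapses those partials with a matmul against a vector of ones instead of a dedicated reduction kernel. A plain C++ sketch of that final reduction step (illustrative, not OneFlow API):

    #include <cstdint>
    #include <vector>

    // Collapse [grid_dim_y, norm_size] partial sums into [norm_size] totals.
    // Equivalent to the kernel's matmul(tmp^T, ones): the ones vector sums over rows.
    std::vector<float> ReducePartialSums(const std::vector<float>& tmp,  // grid_dim_y * norm_size
                                         int64_t grid_dim_y, int64_t norm_size) {
      const std::vector<float> ones(grid_dim_y, 1.0f);
      std::vector<float> out(norm_size, 0.0f);
      for (int64_t col = 0; col < norm_size; ++col) {
        for (int64_t row = 0; row < grid_dim_y; ++row) {
          out[col] += tmp[row * norm_size + col] * ones[row];
        }
      }
      return out;
    }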
-REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(double)
-REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(half)
-
-}  // namespace oneflow
\ No newline at end of file
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "hip/hip_runtime.h"
+#include "oneflow/core/device/cudnn_util.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/ndarray/ndarray_util.h"
+#include "oneflow/core/hip/atomic.hip.h"
+#include
+#include "oneflow/core/kernel/cuda_graph_support.h"
+#include "oneflow/core/ep/include/primitive/fill.h"
+#include "oneflow/core/ep/include/primitive/matmul.h"
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#include "oneflow/core/hip/layer_norm.hip.h"
+#include
+#include
+
+template <typename T, bool is_cuda>
+struct AccumulateType {};
+
+#if defined(__HIPCC__)
+template <> struct AccumulateType<half, true> { using type = float; };
+#endif
+template <> struct AccumulateType<float, true> { using type = float; };
+template <> struct AccumulateType<double, true> { using type = double; };
+template <> struct AccumulateType<int8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, true> { using type = int64_t; };
+template <> struct AccumulateType<char, true> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, true> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, true> { using type = int64_t; };
+template <> struct AccumulateType<bool, true> { using type = bool; };
+template <> struct AccumulateType<float, false> { using type = double; };
+template <> struct AccumulateType<double, false> { using type = double; };
+template <> struct AccumulateType<int8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<uint8_t, false> { using type = int64_t; };
+template <> struct AccumulateType<char, false> { using type = int64_t; };
+template <> struct AccumulateType<int16_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int32_t, false> { using type = int64_t; };
+template <> struct AccumulateType<int64_t, false> { using type = int64_t; };
+template <> struct AccumulateType<bool, false> { using type = bool; };
+
+template <typename T, bool is_cuda>
+using acc_type = typename AccumulateType<T, is_cuda>::type;
+
+#define C10_HOST_DEVICE __host__ __device__
+#define C10_DEVICE __device__
+#define C10_HOST __host__
+#define C10_WARP_SIZE 64
+
+#define VEC 4
+typedef int64_t IndexType;
+
+constexpr int BlockReduceNumThreads = 512;
+constexpr int NumThreads = 256;
+constexpr int ColwiseReduceTileSize = 32;
+
+template <typename scalar_t, typename index_t, typename combine_t>
+struct WelfordData {
+  scalar_t mean;
+  scalar_t m2;
+  index_t n;
+  combine_t nf;
+
+  C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
+
+  C10_HOST_DEVICE WelfordData(
+      scalar_t mean,
+      scalar_t m2,
+      index_t n,
+      combine_t nf)
+      : mean(mean), m2(m2), n(n), nf(nf) {}
+};
+
+
+template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t,
+          typename res_t>
+struct WelfordOps {
+ public:
+  using acc_t = WelfordData<acc_scalar_t, index_t, combine_t>;
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data) const {
+    acc_scalar_t delta = data - acc.mean;
+    // using acc.nf (combine_t) here, as acc.n (index_t) would still be converted;
+    // accumulation in reduce is done through index_t
+    acc_scalar_t new_mean = acc.mean + delta / (acc.nf + 1);
+    acc_scalar_t new_delta = data -
new_mean; + return { + new_mean, + acc.m2 + delta * new_delta, + acc.n + 1, + combine_t(acc.n + 1), // accumulate for combine_t uses index_t + }; + } + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + if (a.nf == 0) { + return b; + } + if (b.nf == 0) { + return a; + } + acc_scalar_t delta = b.mean - a.mean; + combine_t new_count = a.nf + b.nf; + acc_scalar_t nb_over_n = b.nf / new_count; + return { + a.mean + delta * nb_over_n, + a.m2 + b.m2 + delta * delta * a.nf * nb_over_n, + // setting acc.n as -1 since acc.n might not be able to represent the count + // correctly within its range, setting it to -1 to avoid confusion + -1, + new_count + }; + } + inline C10_DEVICE res_t project(acc_t acc) const { + return res_t(acc.m2 / acc.nf, static_cast(acc.mean)); + } + + inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + __shfl_down(acc.mean, offset) + , __shfl_down(acc.m2, offset) + , __shfl_down(acc.n, offset) + , __shfl_down(acc.nf, offset) + }; + } +}; + +template +__inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ __device__ T WarpReduce(T val,int max,const ReduceOp& op) { +#pragma unroll + for (int offset = max; offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ __device__ T +BlockReduce(T val, const ReduceOp& op, T* shared) { + const int lid = threadIdx.x % C10_WARP_SIZE; + const int wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduce(val, op); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + if (wid == 0) { + val= shared[lid]; + val = WarpReduce(val,blockDim.x / C10_WARP_SIZE / 2,op); + } + return val; +} + +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += __shfl_down(val, offset); + } + return val; +} + +template +__inline__ __device__ T WarpReduceSum(T val,int max) { +#pragma unroll + for (int offset = max; offset > 0; offset >>= 1) { + val += __shfl_down(val, offset); + } + return val; +} + + +template +__inline__ __device__ T BlockReduceSum(T val, T* shared) { + const int lid = threadIdx.x % C10_WARP_SIZE; + const int wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + if (wid == 0) { + val= shared[lid]; + val = WarpReduceSum(val,blockDim.x / C10_WARP_SIZE / 2); + } + return val; +} + +template +__global__ void layernorm_forward_kernel(const scalar_t* input,scalar_t* ret,acc_type* mean,acc_type* rstd, + const scalar_t* gamma,const scalar_t* beta,IndexType cols,double eps) +{ + //dropout do nothing in val mode + IndexType i=blockIdx.x; + // add + layernorm get mean and rstd + using T_ACC = acc_type; + using WelfordType = WelfordData; + using WelfordOp = WelfordOps>; + __shared__ typename std::aligned_storage::type val_shared[BlockReduceNumThreads/C10_WARP_SIZE]; + WelfordType* val_shared_ptr = reinterpret_cast(val_shared); + WelfordOp welford_op; + WelfordType val; + + #pragma unroll + for (IndexType j = threadIdx.x; j < cols; j += blockDim.x) { + IndexType index = i * cols + j; + val = welford_op.reduce(val, static_cast(input[index])); + } + val = BlockReduce(val,welford_op,val_shared_ptr); + + __shared__ T_ACC s_mean; + __shared__ T_ACC 
s_rstd; + if (threadIdx.x == 0) { + thrust::tie(s_rstd, s_mean) = welford_op.project(val); + mean[i] = s_mean; + s_rstd=rsqrt(s_rstd + static_cast(eps)); + rstd[i] = s_rstd; + } + __syncthreads(); + //layernorm (x-mean)*rstd*gamma+beta + #pragma unroll + for (IndexType j = threadIdx.x; j < cols; j += blockDim.x) { + IndexType index = i * cols + j; + ret[index] = (static_cast(input[index]) - s_mean)*s_rstd * (gamma == nullptr ? T_ACC(1) : static_cast(gamma[j])) + + (beta == nullptr ? T_ACC(0) : static_cast(beta[j])); + } +} + +template +void LayerNormKernelImplInternal( + oneflow::ep::Stream* stream, + const T* X, + const T* gamma, + const T* beta, + int64_t M, + int64_t N, + double eps, + T* Y, + acc_type* mean, + acc_type* rstd) { + using T_ACC = acc_type; + const T* X_data = X; + const T* gamma_data = gamma; + const T* beta_data = beta; + T* Y_data = Y; + T_ACC* mean_data = mean; + T_ACC* rstd_data = rstd; + hipStream_t cuda_stream = stream->As()->cuda_stream(); + layernorm_forward_kernel<<>>( + X_data,Y_data,mean_data,rstd_data,gamma_data,beta_data,N,eps); +} + +template +__global__ void GammaBetaBackwardSimple(IndexType M,IndexType N,const scalar_t* dY,const scalar_t* X,const acc_type* mean, + const acc_type* rstd,scalar_t* dg,scalar_t* db) +{ + using T_ACC = acc_type; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackward(IndexType M,IndexType N,const scalar_t* dY,const scalar_t* X,const acc_type* mean, + const acc_type* rstd,scalar_t* dg,scalar_t* db) +{ + using T_ACC = acc_type; + __shared__ T_ACC g_shared[ColwiseReduceTileSize][ColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[ColwiseReduceTileSize][ColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__global__ void LayerNormBackward_kernel(IndexType N,const scalar_t* dY,const scalar_t* X,const scalar_t* gamma,const acc_type* mean, + const acc_type* rstd, scalar_t* dX, const scalar_t* add_to_output) +{ + using T_ACC = acc_type; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const IndexType i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + #pragma unroll + for (IndexType j = threadIdx.x; j < N; j += blockDim.x) { + const IndexType index = i * N + j; + const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + const T_ACC s = T_ACC(1) / static_cast(N); + __shared__ T_ACC b; + __shared__ T_ACC c; + if (threadIdx.x == 0) { + b = (sum2 * static_cast(mean[i]) - sum1) * static_cast(rstd[i]) * static_cast(rstd[i]) *static_cast(rstd[i]) * s; + c = -(b * static_cast(mean[i]) + sum2 * static_cast(rstd[i]) * s); + } + __syncthreads(); + #pragma unroll + for (IndexType j = threadIdx.x; j < N; j += blockDim.x) { + const IndexType index = i * N + j; + const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + dX[index] = static_cast(rstd[i]) * static_cast(dY[index]) * gamma_v + b * static_cast(X[index]) + c + + (add_to_output == nullptr ? 
T_ACC(0) : static_cast(add_to_output[index])); + } +} + +template +void LayerNormBackwardKernelImplInternal( + oneflow::ep::Stream* stream, + const T* dY, + const T* X, + const acc_type* mean, + const acc_type* rstd, + const T* gamma, + int64_t M, + int64_t N, + T* dX, + const T* add_to_output) { + using T_ACC = acc_type; + const T* dY_data = dY; + const T* X_data = X; + const T_ACC* mean_data = mean; + const T_ACC* rstd_data = rstd; + const T* gamma_data = gamma; + T* dX_data = dX; + const T* add_to_output_data = add_to_output; + hipStream_t cuda_stream = stream->As()->cuda_stream(); + if (dX_data != nullptr) { + LayerNormBackward_kernel<<>>( + N, dY_data, X_data,gamma_data,mean_data,rstd_data,dX_data,add_to_output_data); + } +} + +template +void LayerNormBackwardKernelImplInternalParam( + oneflow::ep::Stream* stream, + const T* dY, + const T* X, + const acc_type* mean, + const acc_type* rstd, + int64_t M, + int64_t N, + T* dgamma, + T* dbeta) { + using T_ACC = acc_type; + const T* dY_data = dY; + const T* X_data = X; + const T_ACC* mean_data = mean; + const T_ACC* rstd_data = rstd; + hipStream_t cuda_stream = stream->As()->cuda_stream(); + T* dgamma_data = dgamma; + T* dbeta_data = dbeta; + if (M < 512) { + // For small batch size, do colwise reduce directly. + const int64_t B = (N + NumThreads - 1) / NumThreads; + GammaBetaBackwardSimple + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + } else { + const int64_t B = + (N + ColwiseReduceTileSize - 1) / ColwiseReduceTileSize; + constexpr int kThreadX = ColwiseReduceTileSize; + constexpr int kThreadY = ColwiseReduceTileSize / 2; + GammaBetaBackward + <<>>( + M, + N, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + } +} + +namespace oneflow { + +template +class LayerNormGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + LayerNormGpuKernel() = default; + ~LayerNormGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); + user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); + double epsilon = ctx->Attr("epsilon"); + int64_t num_instances = mean->shape_view().elem_cnt(); + int64_t norm_size = x->shape_view().elem_cnt() / num_instances; + const T* gamma_ptr = nullptr; + const T* beta_ptr = nullptr; + if (ctx->has_input("gamma", 0)) { + const user_op::Tensor* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + gamma_ptr = gamma->dptr(); + CHECK_EQ(gamma->shape_view().elem_cnt(), norm_size); + } + if (ctx->has_input("beta", 0)) { beta_ptr = ctx->Tensor4ArgNameAndIndex("beta", 0)->dptr(); } + // DispatchLayerNormForwardGpu(ctx->stream(), num_instances, norm_size, epsilon, x->dptr(), + // gamma_ptr, beta_ptr, y->mut_dptr(), mean, inv_variance); + using ComputeType = typename cuda::layer_norm::DefaultComputeType::type; + LayerNormKernelImplInternal(ctx->stream(), x->dptr(), gamma_ptr, beta_ptr, num_instances, norm_size, epsilon, + y->mut_dptr(), mean->mut_dptr(), inv_variance->mut_dptr()); + }; +}; + +#define REGISTER_LAYER_NORM_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("layer_norm") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == 
DeviceType::kCUDA)                                      \
+                       && (user_op::HobDataType("x", 0) == GetDataType<dtype>::value));
+
+REGISTER_LAYER_NORM_CUDA_KERNEL(float)
+REGISTER_LAYER_NORM_CUDA_KERNEL(double)
+REGISTER_LAYER_NORM_CUDA_KERNEL(half)
+#if CUDA_VERSION >= 11000
+REGISTER_LAYER_NORM_CUDA_KERNEL(nv_bfloat16)
+#endif
+
+template<typename T>
+class LayerNormGradGpuKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
+ public:
+  LayerNormGradGpuKernel() = default;
+  ~LayerNormGradGpuKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
+    const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0);
+    const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0);
+    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
+    int64_t num_instances = mean->shape_view().elem_cnt();
+    int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
+    const T* gamma_ptr = nullptr;
+    if (ctx->has_input("gamma", 0)) {
+      gamma_ptr = ctx->Tensor4ArgNameAndIndex("gamma", 0)->dptr<T>();
+    }
+    const T* add_to_output_ptr = nullptr;
+    if (ctx->has_input("_add_to_output", 0)) {
+      const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0);
+      CHECK_EQ(add_to_output->data_type(), dx->data_type());
+      CHECK_EQ(add_to_output->shape_view(), dx->shape_view());
+      add_to_output_ptr = add_to_output->dptr<T>();
+    }
+    // LaunchLayerNormBackward(ctx->stream(), num_instances, norm_size, dy->dptr(), x->dptr(),
+    //                         mean, inv_variance, gamma_ptr, add_to_output_ptr, dx->mut_dptr());
+    using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
+    LayerNormBackwardKernelImplInternal(ctx->stream(), dy->dptr<T>(), x->dptr<T>(),
+                                        mean->dptr<ComputeType>(), inv_variance->dptr<ComputeType>(),
+                                        gamma_ptr, num_instances, norm_size, dx->mut_dptr<T>(),
+                                        add_to_output_ptr);
+  };
+};
+
+#define REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(dtype)                                        \
+  REGISTER_USER_KERNEL("layer_norm_grad")                                                  \
+      .SetCreateFn<LayerNormGradGpuKernel<dtype>>()                                        \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
+                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value))    \
+      .SetInplaceProposalFn(                                                               \
+          [](const user_op::InferContext& ctx,                                             \
+             const user_op::AddInplaceArgPair& AddInplaceArgPairFn) -> Maybe<void> {       \
+            if (ctx.has_input("_add_to_output", 0)) {                                      \
+              OF_RETURN_IF_ERROR(AddInplaceArgPairFn("dx", 0, "_add_to_output", 0, true)); \
+            }                                                                              \
+            return Maybe<void>::Ok();                                                      \
+          });
+
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(float)
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(double)
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(half)
+#if CUDA_VERSION >= 11000
+REGISTER_LAYER_NORM_GRAD_CUDA_KERNEL(nv_bfloat16)
+#endif
+
+template<typename T>
+class LayerNormParamGradGpuKernel final : public user_op::OpKernel,
+                                          public user_op::CudaGraphSupport {
+ public:
+  LayerNormParamGradGpuKernel() = default;
+  ~LayerNormParamGradGpuKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
+    const user_op::Tensor* mean = ctx->Tensor4ArgNameAndIndex("mean", 0);
+    const user_op::Tensor* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0);
+    int64_t num_instances = mean->shape_view().elem_cnt();
+    int64_t norm_size = x->shape_view().elem_cnt() / num_instances;
+    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+    // const DataType data_type = dy->data_type();
+    // const int grid_dim_x = (norm_size + tile_size - 1) / tile_size;
+    // const int grid_dim_y = GetGirdDimY(num_instances, norm_size);
+    // const size_t tmp_gamma_diff_size = grid_dim_y * norm_size * sizeof(T);
+    // T* tmp_gamma_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr());
+    // T* tmp_beta_diff_ptr = reinterpret_cast(tmp_buffer->mut_dptr() + tmp_gamma_diff_size);
+    // T* reduce_buf_ptr =
+    //     reinterpret_cast(tmp_buffer->mut_dptr() + 2 * tmp_gamma_diff_size);
+    using ComputeType = typename cuda::layer_norm::DefaultComputeType<T>::type;
+    // LayerNormParamGrad<<stream()->As()->cuda_stream()>>>(
+    //     num_instances, norm_size, dy->dptr(), x->dptr(), mean->dptr(),
+    //     inv_variance->dptr(), tmp_gamma_diff_ptr, tmp_beta_diff_ptr);
+    // const int32_t m = norm_size;
+    // const int32_t n = 1;
+    // const int32_t k = grid_dim_y;
+    // std::unique_ptr fill =
+    //     ep::primitive::NewPrimitive(ctx->stream()->device_type(),
+    //                                 data_type);
+    // CHECK(fill);
+    // fill->Launch(ctx->stream(), reduce_buf_ptr, 1.0, grid_dim_y);
+    // std::unique_ptr matmul =
+    //     ep::primitive::NewPrimitive(
+    //         ctx->stream()->device_type(), data_type, ep::primitive::BlasTransposeType::T,
+    //         ep::primitive::BlasTransposeType::N);
+    // CHECK(matmul);
+    // if (ctx->has_output("gamma_diff", 0)) {
+    //   user_op::Tensor* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0);
+    //   matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_gamma_diff_ptr, reduce_buf_ptr, 0.0,
+    //                  gamma_diff->mut_dptr());
+    // }
+    // if (ctx->has_output("beta_diff", 0)) {
+    //   user_op::Tensor* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0);
+    //   matmul->Launch(ctx->stream(), m, n, k, 1.0, tmp_beta_diff_ptr, reduce_buf_ptr, 0.0,
+    //                  beta_diff->mut_dptr());
+    // }
+    T* gamma_diff_ptr = nullptr;
+    T* beta_diff_ptr = nullptr;
+    if (ctx->has_output("gamma_diff", 0)) {
+      gamma_diff_ptr = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0)->mut_dptr<T>();
+    }
+    if (ctx->has_output("beta_diff", 0)) {
+      beta_diff_ptr = ctx->Tensor4ArgNameAndIndex("beta_diff", 0)->mut_dptr<T>();
+    }
+    LayerNormBackwardKernelImplInternalParam(ctx->stream(), dy->dptr<T>(), x->dptr<T>(),
+                                             mean->dptr<ComputeType>(), inv_variance->dptr<ComputeType>(),
+                                             num_instances, norm_size, gamma_diff_ptr, beta_diff_ptr);
+  };
+};
+
+#define REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(dtype)                                    \
+  REGISTER_USER_KERNEL("layer_norm_param_grad")                                             \
+      .SetCreateFn<LayerNormParamGradGpuKernel<dtype>>()                                    \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                      \
+                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value))     \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                   \
+        const int64_t begin_params_axis = ctx->Attr<int64_t>("begin_params_axis");          \
+        const bool has_gamma_diff = ctx->has_output("gamma_diff", 0);                       \
+        const bool has_beta_diff = ctx->has_output("beta_diff", 0);                         \
+        const auto& dy = ctx->InputTensorDesc("dy", 0);                                     \
+        const int64_t num_instances = dy.shape().Count(0, begin_params_axis);               \
+        const int64_t norm_size = dy.shape().Count(begin_params_axis);                      \
+        const int grid_dim_y = num_instances;                                               \
+        size_t tmp_buffer_size = (2 * grid_dim_y * norm_size + grid_dim_y) * sizeof(dtype); \
+        return tmp_buffer_size;                                                             \
+      });
+
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(float)
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(double)
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(half)
+#if CUDA_VERSION >= 11000
+REGISTER_LAYER_NORM_PARAM_GRAD_GPU_KERNEL(nv_bfloat16) +#endif + +} \ No newline at end of file diff --git a/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp b/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp index db2ebfd..de7d6f2 100644 --- a/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp +++ b/oneflow/user/kernels/math_binary_elementwise_kernel.hip.cpp @@ -1,244 +1,244 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/math_binary_elementwise_func.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template class BinaryFunctor, typename T> -__global__ void MathBinaryElementwiseForwardGpu(const int64_t n, const T* x, const T* y, T* z) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { z[i] = BinaryFunctor::Forward(x[i], y[i]); } -} - -template class BinaryFunctor, typename T> -__global__ void MathBinaryElementwiseBackwardXGradGpu(const int64_t n, const T* x, const T* y, - const T* dz, T* dx) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { - dx[i] = BinaryFunctor::BackwardXGrad(x[i], y[i], dz[i]); - } -} - -template class BinaryFunctor, typename T> -__global__ void MathBinaryElementwiseBackwardYGradGpu(const int64_t n, const T* x, const T* y, - const T* dz, T* dy) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { - dy[i] = BinaryFunctor::BackwardYGrad(x[i], y[i], dz[i]); - } -} - -} // namespace - -template class BinaryFunctor, typename T> -class MathBinaryElementwiseGpuKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseGpuKernel() = default; - ~MathBinaryElementwiseGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>( - n, tensor_x->dptr(), tensor_y->dptr(), tensor_z->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor, typename T> -class MathBinaryElementwiseXGradGpuKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseXGradGpuKernel() = default; - ~MathBinaryElementwiseXGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - int64_t n = 
tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardXGradGpu - <<stream()->As()->cuda_stream()>>>( - n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), - tensor_dx->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor, typename T> -class MathBinaryElementwiseYGradGpuKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseYGradGpuKernel() = default; - ~MathBinaryElementwiseYGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardYGradGpu - <<stream()->As()->cuda_stream()>>>( - n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), - tensor_dy->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ - REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ - .SetCreateFn< \ - MathBinaryElementwiseGpuKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ - \ - REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_x_grad")) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ - REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_y_grad")) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - MATH_BINARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - OF_PP_MAKE_TUPLE_SEQ("floordiv", FloorDiv), - INT_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) - -template class BinaryFunctor> -class MathBinaryElementwiseGpuHalfKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseGpuHalfKernel() = default; - ~MathBinaryElementwiseGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* y = reinterpret_cast(tensor_y->dptr()); - half* z = reinterpret_cast(tensor_z->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, y, z); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor> -class MathBinaryElementwiseXGradGpuHalfKernel final : public user_op::OpKernel { - 
public: - MathBinaryElementwiseXGradGpuHalfKernel() = default; - ~MathBinaryElementwiseXGradGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* y = reinterpret_cast(tensor_y->dptr()); - const half* dz = reinterpret_cast(tensor_dz->dptr()); - half* dx = reinterpret_cast(tensor_dx->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardXGradGpu - <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dx); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class BinaryFunctor> -class MathBinaryElementwiseYGradGpuHalfKernel final : public user_op::OpKernel { - public: - MathBinaryElementwiseYGradGpuHalfKernel() = default; - ~MathBinaryElementwiseYGradGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); - user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* y = reinterpret_cast(tensor_y->dptr()); - const half* dz = reinterpret_cast(tensor_dz->dptr()); - half* dy = reinterpret_cast(tensor_dy->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathBinaryElementwiseBackwardYGradGpu - <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dy); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD(math_type_str, \ - math_func_prefix) \ - REGISTER_USER_KERNEL(math_type_str) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ - \ - REGISTER_USER_KERNEL((std::string("") + math_type_str + "_x_grad")) \ - .SetCreateFn< \ - MathBinaryElementwiseXGradGpuHalfKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ - REGISTER_USER_KERNEL((std::string("") + math_type_str + "_y_grad")) \ - .SetCreateFn< \ - MathBinaryElementwiseYGradGpuHalfKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); - -OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD, - MATH_BINARY_ELEMENTWISE_FUNC_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/math_binary_elementwise_func.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template class BinaryFunctor, typename T> +__global__ void MathBinaryElementwiseForwardGpu(const int64_t n, const T* x, const T* y, T* z) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { z[i] = BinaryFunctor::Forward(x[i], y[i]); } +} + +template class BinaryFunctor, typename T> +__global__ void MathBinaryElementwiseBackwardXGradGpu(const int64_t n, const T* x, const T* y, + const T* dz, T* dx) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { + dx[i] = BinaryFunctor::BackwardXGrad(x[i], y[i], dz[i]); + } +} + +template class BinaryFunctor, typename T> +__global__ void MathBinaryElementwiseBackwardYGradGpu(const int64_t n, const T* x, const T* y, + const T* dz, T* dy) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { + dy[i] = BinaryFunctor::BackwardYGrad(x[i], y[i], dz[i]); + } +} + +} // namespace + +template class BinaryFunctor, typename T> +class MathBinaryElementwiseGpuKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseGpuKernel() = default; + ~MathBinaryElementwiseGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>( + n, tensor_x->dptr(), tensor_y->dptr(), tensor_z->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor, typename T> +class MathBinaryElementwiseXGradGpuKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseXGradGpuKernel() = default; + ~MathBinaryElementwiseXGradGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardXGradGpu + <<stream()->As()->cuda_stream()>>>( + n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), + tensor_dx->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor, typename T> +class MathBinaryElementwiseYGradGpuKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseYGradGpuKernel() = default; + ~MathBinaryElementwiseYGradGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void 
Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardYGradGpu + <<stream()->As()->cuda_stream()>>>( + n, tensor_x->dptr(), tensor_y->dptr(), tensor_dz->dptr(), + tensor_dy->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ + REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ + .SetCreateFn< \ + MathBinaryElementwiseGpuKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ + \ + REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_x_grad")) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ + REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_y_grad")) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + MATH_BINARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + OF_PP_MAKE_TUPLE_SEQ("floordiv", FloorDiv), + INT_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ) + +template class BinaryFunctor> +class MathBinaryElementwiseGpuHalfKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseGpuHalfKernel() = default; + ~MathBinaryElementwiseGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + user_op::Tensor* tensor_z = ctx->Tensor4ArgNameAndIndex("z", 0); + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* y = reinterpret_cast(tensor_y->dptr()); + half* z = reinterpret_cast(tensor_z->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, y, z); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor> +class MathBinaryElementwiseXGradGpuHalfKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseXGradGpuHalfKernel() = default; + ~MathBinaryElementwiseXGradGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* y = 
reinterpret_cast(tensor_y->dptr()); + const half* dz = reinterpret_cast(tensor_dz->dptr()); + half* dx = reinterpret_cast(tensor_dx->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardXGradGpu + <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dx); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class BinaryFunctor> +class MathBinaryElementwiseYGradGpuHalfKernel final : public user_op::OpKernel { + public: + MathBinaryElementwiseYGradGpuHalfKernel() = default; + ~MathBinaryElementwiseYGradGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const user_op::Tensor* tensor_dz = ctx->Tensor4ArgNameAndIndex("dz", 0); + user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* y = reinterpret_cast(tensor_y->dptr()); + const half* dz = reinterpret_cast(tensor_dz->dptr()); + half* dy = reinterpret_cast(tensor_dy->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathBinaryElementwiseBackwardYGradGpu + <<stream()->As()->cuda_stream()>>>(n, x, y, dz, dy); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD(math_type_str, \ + math_func_prefix) \ + REGISTER_USER_KERNEL(math_type_str) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ + \ + REGISTER_USER_KERNEL((std::string("") + math_type_str + "_x_grad")) \ + .SetCreateFn< \ + MathBinaryElementwiseXGradGpuHalfKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); \ + REGISTER_USER_KERNEL((std::string("") + math_type_str + "_y_grad")) \ + .SetCreateFn< \ + MathBinaryElementwiseYGradGpuHalfKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); + +OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_BINARY_ELEMENTWISE_CUDA_HALF_KERNEL_AND_GRAD, + MATH_BINARY_ELEMENTWISE_FUNC_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/math_unary_elementwise_func.h b/oneflow/user/kernels/math_unary_elementwise_func.h index c55ecf4..aff50c3 100644 --- a/oneflow/user/kernels/math_unary_elementwise_func.h +++ b/oneflow/user/kernels/math_unary_elementwise_func.h @@ -1,983 +1,983 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifndef ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ -#define ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ - -#include "oneflow/core/common/util.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/user/ops/math_unary_elementwise_seq.h" -#include "oneflow/core/device/cuda_pseudo_half.h" - -#if defined(__CUDACC__) - -#include -#define MATH_FUNC_F(name, x) name##f(x) -#define MATH_FUNC_D(name, x) name(x) - -#elif defined(__HIPCC__) -#include -#include - -#if defined(__HIP_DEVICE_COMPILE__) -#define MATH_FUNC_F(name, x) name##f(x) -#define MATH_FUNC_D(name, x) name(x) -#else -#define MATH_FUNC_F(name, x) std::name(x) -#define MATH_FUNC_D(name, x) std::name(x) -#endif - -#else - -#include -#define MATH_FUNC_F(name, x) std::name(x) -#define MATH_FUNC_D(name, x) std::name(x) - -#endif - -namespace oneflow { - -#define DECLARE_UNARY_FUNCTOR(math_unary_elementwise_type, func_prefix) \ - template \ - struct func_prefix##Functor; - -OF_PP_FOR_EACH_TUPLE(DECLARE_UNARY_FUNCTOR, MATH_UNARY_ELEMENTWISE_FUNC_SEQ) - -template -struct AbsFunctor { - static OF_DEVICE_FUNC T Forward(const T x) { - if (x == T(0)) - return T(0); - else - return x < T(0) ? -x : x; - } - - static OF_DEVICE_FUNC T Backward(const T x, const T dy) { - if (x == T(0)) - return T(0); - else - return x < T(0) ? -dy : dy; - } -}; - -template -struct SignFunctor { - static OF_DEVICE_FUNC T Forward(const T x) { return (T(0) < x) - (x < T(0)); } - - static OF_DEVICE_FUNC T Backward(const T x, const T dy) { return T(0); } -}; - -template<> -struct RsqrtFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { -#if defined(__CUDACC__) - return rsqrtf(x); -#elif defined(__HIP_DEVICE_COMPILE__) - return rsqrtf(x); -#else - return 1.0f / std::sqrt(x); -#endif - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (-1.0f / (2.0f * MATH_FUNC_F(sqrt, x * x * x))); - } -}; - -template<> -struct RsqrtFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { -#if defined(__CUDACC__) - return rsqrt(x); -#elif defined(__HIP_DEVICE_COMPILE__) - return rsqrt(x); -#else - return 1.0 / std::sqrt(x); -#endif - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (-1.0 / (2.0 * MATH_FUNC_D(sqrt, x * x * x))); - } -}; - -// float version - -template<> -struct AcosFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acos, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * -RsqrtFunctor::Forward(1.0f - x * x); - } -}; - -template<> -struct AcoshFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acosh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * RsqrtFunctor::Forward(x * x - 1.0f); - } -}; - -template<> -struct AsinFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asin, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * RsqrtFunctor::Forward(1.0f - x * x); - } -}; - -template<> -struct AsinhFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asinh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * RsqrtFunctor::Forward(1.0f + x * x); - } -}; - -template<> -struct AtanFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atan, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - 
return dy * (1.0f / (1.0f + x * x)); - } -}; - -template<> -struct AtanhFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atanh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (1.0f - x * x)); - } -}; - -template<> -struct NotEqualZeroFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return x != 0; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct CeilFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(ceil, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct CosFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cos, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (-MATH_FUNC_F(sin, x)); - } -}; - -template<> -struct CoshFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cosh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(sinh, x); - } -}; - -template<> -struct ErfFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erf, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * 2.0f * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ErfcFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erfc, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * -2.0f * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ExpFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(exp, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(exp, x); - } -}; - -template<> -struct Expm1Functor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(expm1, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(exp, x); - } -}; - -template<> -struct FloorFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(floor, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct LgammaFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(lgamma, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - // TODO(chengcheng): return: dy * digamma(x) - // assert(false); - return 0.0f; - } -}; - -template<> -struct LogFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * (1.0f / x); } -}; - -template<> -struct Log2Functor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log2, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (x * MATH_FUNC_F(log, 2.0f))); - } -}; - -template<> -struct Log1pFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log1p, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (x + 1.0f)); - } -}; - -template<> -struct LogSigmoidFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { - return 
-MATH_FUNC_F(log, (1.0f + MATH_FUNC_F(exp, -x))); - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (MATH_FUNC_F(exp, x) + 1.0f)); - } -}; - -template<> -struct NegativeFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return -x; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return -dy; } -}; - -template<> -struct ReciprocalFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return 1.0f / x; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (-1.0f / (x * x)); - } -}; - -template<> -struct ReciprocalNoNanFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { - if (fabsf(x) <= 0.0f) { return 0.0f; } - return 1.0f / x; - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - if (fabsf(x) <= 0.0f) { return 0.0f; } - return dy * (-1.0f / (x * x)); - } -}; - -template<> -struct RintFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(rint, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct RoundFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(nearbyint, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } -}; - -template<> -struct SigmoidFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { - return 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); - } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - float y = 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); - return dy * (y * (1.0f - y)); - } -}; - -template<> -struct SinFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sin, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(cos, x); - } -}; - -template<> -struct SinhFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sinh, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * MATH_FUNC_F(cosh, x); - } -}; - -template<> -struct SqrtFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sqrt, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * 0.5f / MATH_FUNC_F(sqrt, x); - } -}; - -template<> -struct SquareFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return x * x; } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * 2.0f * x; } -}; - -template<> -struct TanFunctor { - static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(tan, x); } - - static OF_DEVICE_FUNC float Backward(const float x, const float dy) { - return dy * (1.0f / (MATH_FUNC_F(cos, x) * MATH_FUNC_F(cos, x))); - } -}; - -// double version - -template<> -struct AcosFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acos, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * -RsqrtFunctor::Forward(1.0 - x * x); - } -}; - -template<> -struct AcoshFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acosh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * -RsqrtFunctor::Forward(x * x - 1.0); - } -}; - -template<> -struct AsinFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asin, 
x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * RsqrtFunctor::Forward(1.0 - x * x); - } -}; - -template<> -struct AsinhFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asinh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * RsqrtFunctor::Forward(1.0 + x * x); - } -}; - -template<> -struct AtanFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atan, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (1.0 + x * x)); - } -}; - -template<> -struct AtanhFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atanh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (1.0 - x * x)); - } -}; - -template<> -struct NotEqualZeroFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return x != 0; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0f; } -}; - -template<> -struct CeilFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(ceil, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct CosFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cos, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (-MATH_FUNC_D(sin, x)); - } -}; - -template<> -struct CoshFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cosh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(sinh, x); - } -}; - -template<> -struct ErfFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erf, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * 2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ErfcFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erfc, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * -2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); - } -}; - -template<> -struct ExpFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(exp, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(exp, x); - } -}; - -template<> -struct Expm1Functor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(expm1, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(exp, x); - } -}; - -template<> -struct FloorFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(floor, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct LgammaFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(lgamma, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - // TODO(chengcheng): return: dy * digamma(x) - // assert(false); - return 0.0; - } -}; - -template<> -struct LogFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const 
double dy) { return dy * (1.0 / x); } -}; - -template<> -struct Log2Functor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log2, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (x * MATH_FUNC_D(log, 2.0))); - } -}; - -template<> -struct Log1pFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log1p, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (x + 1.0)); - } -}; - -template<> -struct LogSigmoidFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { - return -MATH_FUNC_D(log, (1.0 + MATH_FUNC_D(exp, -x))); - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (MATH_FUNC_D(exp, x) + 1.0)); - } -}; - -template<> -struct NegativeFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return -x; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return -dy; } -}; - -template<> -struct ReciprocalFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return 1.0 / x; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (-1.0 / (x * x)); - } -}; - -template<> -struct ReciprocalNoNanFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { - if (fabs(x) <= 0.0) { return 0.0; } - return 1.0 / x; - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - if (fabs(x) <= 0.0) { return 0.0; } - return dy * (-1.0 / (x * x)); - } -}; - -template<> -struct RintFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(rint, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct RoundFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(nearbyint, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } -}; - -template<> -struct SigmoidFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { - return 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); - } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - double y = 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); - return dy * (y * (1.0 - y)); - } -}; - -template<> -struct SinFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sin, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(cos, x); - } -}; - -template<> -struct SinhFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sinh, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * MATH_FUNC_D(cosh, x); - } -}; - -template<> -struct SqrtFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sqrt, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (double)0.5 / MATH_FUNC_D(sqrt, x); - } -}; - -template<> -struct SquareFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return x * x; } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return dy * 2.0 * x; } -}; - -template<> -struct TanFunctor { - static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(tan, x); } - - static OF_DEVICE_FUNC double Backward(const double x, const double dy) { - return dy * (1.0 / (MATH_FUNC_D(cos, 
x) * MATH_FUNC_D(cos, x))); - } -}; - -#if defined(__CUDACC__) || defined(__HIPCC__) -// half version - -#define OF_HALF_FUNC __device__ __forceinline__ - -#define MATH_FUNC_H(name, x) __float2half(name##f(__half2float(x))) -#define HALF_VAL_HALF __float2half(0.5f) -#define HALF_VAL_TWO __float2half(2.0f) -#define HALF_VAL_2RSQRT_PI __float2half(1.1283791671f) - -template<> -struct AbsFunctor { - static OF_HALF_FUNC half Forward(const half x) { - return __hlt(x, GetZeroVal()) ? __hneg(x) : x; - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hlt(x, GetZeroVal()) ? __hneg(dy) : dy; - } -}; - -template<> -struct AcosFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acos, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hrsqrt(__hsub(GetOneVal(), __hmul(x, x))))); - } -}; - -template<> -struct AcoshFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acosh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrsqrt(__hsub(__hmul(x, x), GetOneVal()))); - } -}; - -template<> -struct AsinFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asin, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrsqrt(__hsub(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct AsinhFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asinh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrsqrt(__hadd(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct AtanFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atan, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hdiv(GetOneVal(), __hadd(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct AtanhFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atanh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hdiv(GetOneVal(), __hsub(GetOneVal(), __hmul(x, x)))); - } -}; - -template<> -struct CeilFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hceil(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct NotEqualZeroFunctor { - static OF_HALF_FUNC half Forward(const half x) { return x != static_cast(0.0); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct CosFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hcos(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hsin(x))); - } -}; - -template<> -struct CoshFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(cosh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, MATH_FUNC_H(sinh, x)); - } -}; - -template<> -struct ErfFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erf, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x)))); - } -}; - -template<> -struct ErfcFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erfc, x); } - - static OF_HALF_FUNC half Backward(const half x, const 
half dy) { - return __hmul(dy, __hneg(__hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x))))); - } -}; - -template<> -struct ExpFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hexp(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } -}; - -template<> -struct Expm1Functor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(expm1, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } -}; - -template<> -struct FloorFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hfloor(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct LgammaFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(lgamma, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - // TODO(chengcheng): return: dy * digamma(x) - // assert(false); - return GetZeroVal(); - } -}; - -template<> -struct LogFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hlog(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hrcp(x)); } -}; - -template<> -struct Log2Functor { - static OF_HALF_FUNC half Forward(const half x) { return hlog2(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hmul(x, hlog(HALF_VAL_TWO)))); - } -}; - -template<> -struct Log1pFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(log1p, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hadd(x, GetOneVal()))); - } -}; - -template<> -struct LogSigmoidFunctor { - static OF_HALF_FUNC half Forward(const half x) { - return __hneg(hlog(__hadd(GetOneVal(), hexp(__hneg(x))))); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hadd(hexp(x), GetOneVal()))); - } -}; - -template<> -struct NegativeFunctor { - static OF_HALF_FUNC half Forward(const half x) { return __hneg(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hneg(dy); } -}; - -template<> -struct ReciprocalFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hrcp(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); - } -}; - -template<> -struct ReciprocalNoNanFunctor { - static OF_HALF_FUNC half Forward(const half x) { - if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } - return hrcp(x); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } - return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); - } -}; - -template<> -struct RintFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hrint(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct RoundFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(nearbyint, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct RsqrtFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hrsqrt(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hneg(hrcp(__hmul(HALF_VAL_TWO, hsqrt(__hmul(x, __hmul(x, x))))))); - } -}; - -template<> -struct SigmoidFunctor { - 
static OF_HALF_FUNC half Forward(const half x) { - return hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - half y = hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); - return __hmul(dy, __hmul(y, __hsub(GetOneVal(), y))); - } -}; - -template<> -struct SignFunctor { - static OF_HALF_FUNC half Forward(const half x) { - if (__hgt(x, GetZeroVal())) { return GetOneVal(); } - if (__hlt(x, GetZeroVal())) { return __hneg(GetOneVal()); } - return GetZeroVal(); - } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } -}; - -template<> -struct SinFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hsin(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hcos(x)); } -}; - -template<> -struct SinhFunctor { - static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(sinh, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, MATH_FUNC_H(cosh, x)); - } -}; - -template<> -struct SqrtFunctor { - static OF_HALF_FUNC half Forward(const half x) { return hsqrt(x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hdiv(HALF_VAL_HALF, hsqrt(x))); - } -}; - -template<> -struct SquareFunctor { - static OF_HALF_FUNC half Forward(const half x) { return __hmul(x, x); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, __hmul(HALF_VAL_TWO, x)); - } -}; - -template<> -struct TanFunctor { - static OF_HALF_FUNC half Forward(const half x) { return __hdiv(hsin(x), hcos(x)); } - - static OF_HALF_FUNC half Backward(const half x, const half dy) { - return __hmul(dy, hrcp(__hmul(hcos(x), hcos(x)))); - } -}; - -#endif - -} // namespace oneflow - -#endif // ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#ifndef ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_
+#define ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_
+
+#include "oneflow/core/common/util.h"
+#include "oneflow/core/common/data_type.h"
+#include "oneflow/user/ops/math_unary_elementwise_seq.h"
+#include "oneflow/core/device/cuda_pseudo_half.h"
+
+#if defined(__CUDACC__)
+
+#include <cuda_fp16.h>
+#define MATH_FUNC_F(name, x) name##f(x)
+#define MATH_FUNC_D(name, x) name(x)
+
+#elif defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+#if defined(__HIP_DEVICE_COMPILE__)
+#define MATH_FUNC_F(name, x) name##f(x)
+#define MATH_FUNC_D(name, x) name(x)
+#else
+#define MATH_FUNC_F(name, x) std::name(x)
+#define MATH_FUNC_D(name, x) std::name(x)
+#endif
+
+#else
+
+#include <cmath>
+#define MATH_FUNC_F(name, x) std::name(x)
+#define MATH_FUNC_D(name, x) std::name(x)
+
+#endif
+
+namespace oneflow {
+
+#define DECLARE_UNARY_FUNCTOR(math_unary_elementwise_type, func_prefix) \
+  template<typename T>                                                  \
+  struct func_prefix##Functor;
+
+OF_PP_FOR_EACH_TUPLE(DECLARE_UNARY_FUNCTOR, MATH_UNARY_ELEMENTWISE_FUNC_SEQ)
+
+template<typename T>
+struct AbsFunctor {
+  static OF_DEVICE_FUNC T Forward(const T x) {
+    if (x == T(0))
+      return T(0);
+    else
+      return x < T(0) ? -x : x;
+  }
+
+  static OF_DEVICE_FUNC T Backward(const T x, const T dy) {
+    if (x == T(0))
+      return T(0);
+    else
+      return x < T(0) ? -dy : dy;
+  }
+};
+
+template<typename T>
+struct SignFunctor {
+  static OF_DEVICE_FUNC T Forward(const T x) { return (T(0) < x) - (x < T(0)); }
+
+  static OF_DEVICE_FUNC T Backward(const T x, const T dy) { return T(0); }
+};
+
+template<>
+struct RsqrtFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) {
+#if defined(__CUDACC__)
+    return rsqrtf(x);
+#elif defined(__HIP_DEVICE_COMPILE__)
+    return rsqrtf(x);
+#else
+    return 1.0f / std::sqrt(x);
+#endif
+  }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (-1.0f / (2.0f * MATH_FUNC_F(sqrt, x * x * x)));
+  }
+};
+
+template<>
+struct RsqrtFunctor<double> {
+  static OF_DEVICE_FUNC double Forward(const double x) {
+#if defined(__CUDACC__)
+    return rsqrt(x);
+#elif defined(__HIP_DEVICE_COMPILE__)
+    return rsqrt(x);
+#else
+    return 1.0 / std::sqrt(x);
+#endif
+  }
+
+  static OF_DEVICE_FUNC double Backward(const double x, const double dy) {
+    return dy * (-1.0 / (2.0 * MATH_FUNC_D(sqrt, x * x * x)));
+  }
+};
+
+// float version
+
+template<>
+struct AcosFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acos, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * -RsqrtFunctor<float>::Forward(1.0f - x * x);
+  }
+};
+
+template<>
+struct AcoshFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(acosh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * RsqrtFunctor<float>::Forward(x * x - 1.0f);
+  }
+};
+
+template<>
+struct AsinFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asin, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * RsqrtFunctor<float>::Forward(1.0f - x * x);
+  }
+};
+
+template<>
+struct AsinhFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(asinh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * RsqrtFunctor<float>::Forward(1.0f + x * x);
+  }
+};
+
+template<>
+struct AtanFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atan, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (1.0f + x * x));
+  }
+};
+
+template<>
+struct AtanhFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(atanh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (1.0f - x * x));
+  }
+};
+
+template<>
+struct NotEqualZeroFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return x != 0; }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; }
+};
+
+template<>
+struct CeilFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(ceil, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; }
+};
+
+template<>
+struct CosFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cos, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (-MATH_FUNC_F(sin, x));
+  }
+};
+
+template<>
+struct CoshFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(cosh, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * MATH_FUNC_F(sinh, x);
+  }
+};
+
+template<>
+struct ErfFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erf, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * 2.0f * RsqrtFunctor<float>::Forward(M_PI) * expf(-x * x);
+  }
+};
+
+template<>
+struct ErfcFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(erfc, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * -2.0f * RsqrtFunctor<float>::Forward(M_PI) * expf(-x * x);
+  }
+};
+
+template<>
+struct ExpFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(exp, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * MATH_FUNC_F(exp, x);
+  }
+};
+
+template<>
+struct Expm1Functor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(expm1, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * MATH_FUNC_F(exp, x);
+  }
+};
+
+template<>
+struct FloorFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(floor, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; }
+};
+
+template<>
+struct LgammaFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(lgamma, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    // TODO(chengcheng): return: dy * digamma(x)
+    // assert(false);
+    return 0.0f;
+  }
+};
+
+template<>
+struct LogFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * (1.0f / x); }
+};
+
+template<>
+struct Log2Functor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log2, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (x * MATH_FUNC_F(log, 2.0f)));
+  }
+};
+
+template<>
+struct Log1pFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(log1p, x); }
+
+  static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
+    return dy * (1.0f / (x + 1.0f));
+  }
+};
+
+template<>
+struct LogSigmoidFunctor<float> {
+  static OF_DEVICE_FUNC float Forward(const float x) {
+    return
-MATH_FUNC_F(log, (1.0f + MATH_FUNC_F(exp, -x))); + } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * (1.0f / (MATH_FUNC_F(exp, x) + 1.0f)); + } +}; + +template<> +struct NegativeFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return -x; } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return -dy; } +}; + +template<> +struct ReciprocalFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return 1.0f / x; } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * (-1.0f / (x * x)); + } +}; + +template<> +struct ReciprocalNoNanFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { + if (fabsf(x) <= 0.0f) { return 0.0f; } + return 1.0f / x; + } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + if (fabsf(x) <= 0.0f) { return 0.0f; } + return dy * (-1.0f / (x * x)); + } +}; + +template<> +struct RintFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(rint, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } +}; + +template<> +struct RoundFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(nearbyint, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return 0.0f; } +}; + +template<> +struct SigmoidFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { + return 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); + } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + float y = 1.0f / (1.0f + MATH_FUNC_F(exp, -x)); + return dy * (y * (1.0f - y)); + } +}; + +template<> +struct SinFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sin, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * MATH_FUNC_F(cos, x); + } +}; + +template<> +struct SinhFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sinh, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * MATH_FUNC_F(cosh, x); + } +}; + +template<> +struct SqrtFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(sqrt, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * 0.5f / MATH_FUNC_F(sqrt, x); + } +}; + +template<> +struct SquareFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return x * x; } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { return dy * 2.0f * x; } +}; + +template<> +struct TanFunctor { + static OF_DEVICE_FUNC float Forward(const float x) { return MATH_FUNC_F(tan, x); } + + static OF_DEVICE_FUNC float Backward(const float x, const float dy) { + return dy * (1.0f / (MATH_FUNC_F(cos, x) * MATH_FUNC_F(cos, x))); + } +}; + +// double version + +template<> +struct AcosFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acos, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * -RsqrtFunctor::Forward(1.0 - x * x); + } +}; + +template<> +struct AcoshFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(acosh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * -RsqrtFunctor::Forward(x * x - 1.0); + } +}; + +template<> +struct AsinFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asin, 
x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * RsqrtFunctor::Forward(1.0 - x * x); + } +}; + +template<> +struct AsinhFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(asinh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * RsqrtFunctor::Forward(1.0 + x * x); + } +}; + +template<> +struct AtanFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atan, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (1.0 + x * x)); + } +}; + +template<> +struct AtanhFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(atanh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (1.0 - x * x)); + } +}; + +template<> +struct NotEqualZeroFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return x != 0; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0f; } +}; + +template<> +struct CeilFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(ceil, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct CosFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cos, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (-MATH_FUNC_D(sin, x)); + } +}; + +template<> +struct CoshFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(cosh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(sinh, x); + } +}; + +template<> +struct ErfFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erf, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * 2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); + } +}; + +template<> +struct ErfcFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(erfc, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * -2.0 * RsqrtFunctor::Forward(M_PI) * expf(-x * x); + } +}; + +template<> +struct ExpFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(exp, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(exp, x); + } +}; + +template<> +struct Expm1Functor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(expm1, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(exp, x); + } +}; + +template<> +struct FloorFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(floor, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct LgammaFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(lgamma, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + // TODO(chengcheng): return: dy * digamma(x) + // assert(false); + return 0.0; + } +}; + +template<> +struct LogFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const 
double dy) { return dy * (1.0 / x); } +}; + +template<> +struct Log2Functor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log2, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (x * MATH_FUNC_D(log, 2.0))); + } +}; + +template<> +struct Log1pFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(log1p, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (x + 1.0)); + } +}; + +template<> +struct LogSigmoidFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { + return -MATH_FUNC_D(log, (1.0 + MATH_FUNC_D(exp, -x))); + } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (MATH_FUNC_D(exp, x) + 1.0)); + } +}; + +template<> +struct NegativeFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return -x; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return -dy; } +}; + +template<> +struct ReciprocalFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return 1.0 / x; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (-1.0 / (x * x)); + } +}; + +template<> +struct ReciprocalNoNanFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { + if (fabs(x) <= 0.0) { return 0.0; } + return 1.0 / x; + } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + if (fabs(x) <= 0.0) { return 0.0; } + return dy * (-1.0 / (x * x)); + } +}; + +template<> +struct RintFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(rint, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct RoundFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(nearbyint, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return 0.0; } +}; + +template<> +struct SigmoidFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { + return 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); + } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + double y = 1.0 / (1.0 + MATH_FUNC_D(exp, -x)); + return dy * (y * (1.0 - y)); + } +}; + +template<> +struct SinFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sin, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(cos, x); + } +}; + +template<> +struct SinhFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sinh, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * MATH_FUNC_D(cosh, x); + } +}; + +template<> +struct SqrtFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(sqrt, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (double)0.5 / MATH_FUNC_D(sqrt, x); + } +}; + +template<> +struct SquareFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return x * x; } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { return dy * 2.0 * x; } +}; + +template<> +struct TanFunctor { + static OF_DEVICE_FUNC double Forward(const double x) { return MATH_FUNC_D(tan, x); } + + static OF_DEVICE_FUNC double Backward(const double x, const double dy) { + return dy * (1.0 / (MATH_FUNC_D(cos, 
x) * MATH_FUNC_D(cos, x))); + } +}; + +#if defined(__CUDACC__) || defined(__HIPCC__) +// half version + +#define OF_HALF_FUNC __device__ __forceinline__ + +#define MATH_FUNC_H(name, x) __float2half(name##f(__half2float(x))) +#define HALF_VAL_HALF __float2half(0.5f) +#define HALF_VAL_TWO __float2half(2.0f) +#define HALF_VAL_2RSQRT_PI __float2half(1.1283791671f) + +template<> +struct AbsFunctor { + static OF_HALF_FUNC half Forward(const half x) { + return __hlt(x, GetZeroVal()) ? __hneg(x) : x; + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hlt(x, GetZeroVal()) ? __hneg(dy) : dy; + } +}; + +template<> +struct AcosFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acos, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hrsqrt(__hsub(GetOneVal(), __hmul(x, x))))); + } +}; + +template<> +struct AcoshFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(acosh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrsqrt(__hsub(__hmul(x, x), GetOneVal()))); + } +}; + +template<> +struct AsinFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asin, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrsqrt(__hsub(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct AsinhFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(asinh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrsqrt(__hadd(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct AtanFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atan, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hdiv(GetOneVal(), __hadd(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct AtanhFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(atanh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hdiv(GetOneVal(), __hsub(GetOneVal(), __hmul(x, x)))); + } +}; + +template<> +struct CeilFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hceil(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct NotEqualZeroFunctor { + static OF_HALF_FUNC half Forward(const half x) { return x != static_cast(0.0); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct CosFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hcos(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hsin(x))); + } +}; + +template<> +struct CoshFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(cosh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, MATH_FUNC_H(sinh, x)); + } +}; + +template<> +struct ErfFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erf, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x)))); + } +}; + +template<> +struct ErfcFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(erfc, x); } + + static OF_HALF_FUNC half Backward(const half x, const 
half dy) { + return __hmul(dy, __hneg(__hmul(HALF_VAL_2RSQRT_PI, hexp(__hmul(__hneg(x), x))))); + } +}; + +template<> +struct ExpFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hexp(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } +}; + +template<> +struct Expm1Functor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(expm1, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hexp(x)); } +}; + +template<> +struct FloorFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hfloor(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct LgammaFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(lgamma, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + // TODO(chengcheng): return: dy * digamma(x) + // assert(false); + return GetZeroVal(); + } +}; + +template<> +struct LogFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hlog(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hrcp(x)); } +}; + +template<> +struct Log2Functor { + static OF_HALF_FUNC half Forward(const half x) { return hlog2(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hmul(x, hlog(HALF_VAL_TWO)))); + } +}; + +template<> +struct Log1pFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(log1p, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hadd(x, GetOneVal()))); + } +}; + +template<> +struct LogSigmoidFunctor { + static OF_HALF_FUNC half Forward(const half x) { + return __hneg(hlog(__hadd(GetOneVal(), hexp(__hneg(x))))); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hadd(hexp(x), GetOneVal()))); + } +}; + +template<> +struct NegativeFunctor { + static OF_HALF_FUNC half Forward(const half x) { return __hneg(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hneg(dy); } +}; + +template<> +struct ReciprocalFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hrcp(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); + } +}; + +template<> +struct ReciprocalNoNanFunctor { + static OF_HALF_FUNC half Forward(const half x) { + if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } + return hrcp(x); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + if (__heq(GetZeroVal(), x)) { return GetZeroVal(); } + return __hmul(dy, __hneg(hrcp(__hmul(x, x)))); + } +}; + +template<> +struct RintFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hrint(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct RoundFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(nearbyint, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct RsqrtFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hrsqrt(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hneg(hrcp(__hmul(HALF_VAL_TWO, hsqrt(__hmul(x, __hmul(x, x))))))); + } +}; + +template<> +struct SigmoidFunctor { + 
static OF_HALF_FUNC half Forward(const half x) { + return hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + half y = hrcp(__hadd(GetOneVal(), hexp(__hneg(x)))); + return __hmul(dy, __hmul(y, __hsub(GetOneVal(), y))); + } +}; + +template<> +struct SignFunctor { + static OF_HALF_FUNC half Forward(const half x) { + if (__hgt(x, GetZeroVal())) { return GetOneVal(); } + if (__hlt(x, GetZeroVal())) { return __hneg(GetOneVal()); } + return GetZeroVal(); + } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return GetZeroVal(); } +}; + +template<> +struct SinFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hsin(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { return __hmul(dy, hcos(x)); } +}; + +template<> +struct SinhFunctor { + static OF_HALF_FUNC half Forward(const half x) { return MATH_FUNC_H(sinh, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, MATH_FUNC_H(cosh, x)); + } +}; + +template<> +struct SqrtFunctor { + static OF_HALF_FUNC half Forward(const half x) { return hsqrt(x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hdiv(HALF_VAL_HALF, hsqrt(x))); + } +}; + +template<> +struct SquareFunctor { + static OF_HALF_FUNC half Forward(const half x) { return __hmul(x, x); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, __hmul(HALF_VAL_TWO, x)); + } +}; + +template<> +struct TanFunctor { + static OF_HALF_FUNC half Forward(const half x) { return __hdiv(hsin(x), hcos(x)); } + + static OF_HALF_FUNC half Backward(const half x, const half dy) { + return __hmul(dy, hrcp(__hmul(hcos(x), hcos(x)))); + } +}; + +#endif + +} // namespace oneflow + +#endif // ONEFLOW_USER_KERNELS_MATH_UNARY_ELEMENTWISE_FUNC_H_ diff --git a/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp b/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp index 160dd50..1c276ab 100644 --- a/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp +++ b/oneflow/user/kernels/math_unary_elementwise_kernel.hip.cpp @@ -1,177 +1,177 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/math_unary_elementwise_func.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template class UnaryFunctor, typename T> -__global__ void MathUnaryElementwiseForwardGpu(const int64_t n, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { y[i] = UnaryFunctor::Forward(x[i]); } -} - -template class UnaryFunctor, typename T> -__global__ void MathUnaryElementwiseBackwardGpu(const int64_t n, const T* x, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { dx[i] = UnaryFunctor::Backward(x[i], dy[i]); } -} - -} // namespace - -template class UnaryFunctor, typename T> -class MathUnaryElementwiseGpuKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGpuKernel() = default; - ~MathUnaryElementwiseGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const T* x = tensor_x->dptr(); - T* y = tensor_y->mut_dptr(); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, y); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class UnaryFunctor, typename T> -class MathUnaryElementwiseGradGpuKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGradGpuKernel() = default; - ~MathUnaryElementwiseGradGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const T* x = tensor_x->dptr(); - const T* dy = tensor_dy->dptr(); - T* dx = tensor_dx->mut_dptr(); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseBackwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ - REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ - .SetCreateFn< \ - MathUnaryElementwiseGpuKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)) \ - && (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ - \ - REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_grad")) \ - .SetCreateFn< \ - MathUnaryElementwiseGradGpuKernel>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - MATH_UNARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) - -// For some special dtype kernel register. 
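// The kernels in this file reduce every unary op to the shared functor contract from
// math_unary_elementwise_func.h: the forward kernel computes y[i] = UnaryFunctor<T>::Forward(x[i])
// and the grad kernel computes dx[i] = UnaryFunctor<T>::Backward(x[i], dy[i]). A minimal
// host-side sketch of that contract (illustrative only; it assumes OF_DEVICE_FUNC permits host
// calls, which the std:: fallback of MATH_FUNC_F suggests):
//
//   const float x = 0.5f, dy = 1.0f;
//   const float y = oneflow::SigmoidFunctor<float>::Forward(x);        // 1 / (1 + exp(-x))
//   const float dx = oneflow::SigmoidFunctor<float>::Backward(x, dy);  // dy * y * (1 - y)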
-OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - OF_PP_MAKE_TUPLE_SEQ("abs", Abs), UNSIGNED_INT_DATA_TYPE_SEQ) -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, - OF_PP_MAKE_TUPLE_SEQ("abs", Abs), INT_DATA_TYPE_SEQ) - -template class UnaryFunctor> -class MathUnaryElementwiseGpuHalfKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGpuHalfKernel() = default; - ~MathUnaryElementwiseGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); - const half* x = reinterpret_cast(tensor_x->dptr()); - half* y = reinterpret_cast(tensor_y->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseForwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, y); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template class UnaryFunctor> -class MathUnaryElementwiseGradGpuHalfKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - MathUnaryElementwiseGradGpuHalfKernel() = default; - ~MathUnaryElementwiseGradGpuHalfKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const half* x = reinterpret_cast(tensor_x->dptr()); - const half* dy = reinterpret_cast(tensor_dy->dptr()); - half* dx = reinterpret_cast(tensor_dx->mut_dptr()); - int64_t n = tensor_x->shape_view().elem_cnt(); - if (n == 0) { return; } - MathUnaryElementwiseBackwardGpu - <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD(math_type_str, math_func_prefix) \ - REGISTER_USER_KERNEL(math_type_str) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16) \ - && (user_op::HobDataType("y", 0) == DataType::kFloat16)); \ - \ - REGISTER_USER_KERNEL((std::string("") + math_type_str + "_grad")) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == DataType::kFloat16)); - -// OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD, -// MATH_UNARY_ELEMENTWISE_FUNC_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/math_unary_elementwise_func.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template class UnaryFunctor, typename T> +__global__ void MathUnaryElementwiseForwardGpu(const int64_t n, const T* x, T* y) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { y[i] = UnaryFunctor::Forward(x[i]); } +} + +template class UnaryFunctor, typename T> +__global__ void MathUnaryElementwiseBackwardGpu(const int64_t n, const T* x, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, n) { dx[i] = UnaryFunctor::Backward(x[i], dy[i]); } +} + +} // namespace + +template class UnaryFunctor, typename T> +class MathUnaryElementwiseGpuKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGpuKernel() = default; + ~MathUnaryElementwiseGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const T* x = tensor_x->dptr(); + T* y = tensor_y->mut_dptr(); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, y); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class UnaryFunctor, typename T> +class MathUnaryElementwiseGradGpuKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGradGpuKernel() = default; + ~MathUnaryElementwiseGradGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const T* x = tensor_x->dptr(); + const T* dy = tensor_dy->dptr(); + T* dx = tensor_dx->mut_dptr(); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseBackwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD(math_type_pair, data_type_pair) \ + REGISTER_USER_KERNEL(OF_PP_PAIR_FIRST(math_type_pair)) \ + .SetCreateFn< \ + MathUnaryElementwiseGpuKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair)) \ + && (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(data_type_pair))); \ + \ + REGISTER_USER_KERNEL((std::string("") + OF_PP_PAIR_FIRST(math_type_pair) + "_grad")) \ + .SetCreateFn< \ + MathUnaryElementwiseGradGpuKernel>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(data_type_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + MATH_UNARY_ELEMENTWISE_FUNC_SEQ, FLOATING_DATA_TYPE_SEQ) + +// For some special dtype kernel register. 
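// For reference, REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD above expands roughly as
// follows for the ("sqrt", Sqrt) x (float, DataType::kFloat) pair; this is an illustrative
// sketch, not verbatim preprocessor output. The same macro is reused just below for the integer
// specializations of "abs".
//
//   REGISTER_USER_KERNEL("sqrt")
//       .SetCreateFn<MathUnaryElementwiseGpuKernel<SqrtFunctor, float>>()
//       .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
//                        && (user_op::HobDataType("x", 0) == DataType::kFloat)
//                        && (user_op::HobDataType("y", 0) == DataType::kFloat));
//
//   REGISTER_USER_KERNEL("sqrt_grad")
//       .SetCreateFn<MathUnaryElementwiseGradGpuKernel<SqrtFunctor, float>>()
//       .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
//                        && (user_op::HobDataType("x", 0) == DataType::kFloat));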
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + OF_PP_MAKE_TUPLE_SEQ("abs", Abs), UNSIGNED_INT_DATA_TYPE_SEQ) +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_CUDA_KERNEL_AND_GRAD, + OF_PP_MAKE_TUPLE_SEQ("abs", Abs), INT_DATA_TYPE_SEQ) + +template class UnaryFunctor> +class MathUnaryElementwiseGpuHalfKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGpuHalfKernel() = default; + ~MathUnaryElementwiseGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* tensor_y = ctx->Tensor4ArgNameAndIndex("y", 0); + const half* x = reinterpret_cast(tensor_x->dptr()); + half* y = reinterpret_cast(tensor_y->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseForwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, y); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template class UnaryFunctor> +class MathUnaryElementwiseGradGpuHalfKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + MathUnaryElementwiseGradGpuHalfKernel() = default; + ~MathUnaryElementwiseGradGpuHalfKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* tensor_x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* tensor_dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* tensor_dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const half* x = reinterpret_cast(tensor_x->dptr()); + const half* dy = reinterpret_cast(tensor_dy->dptr()); + half* dx = reinterpret_cast(tensor_dx->mut_dptr()); + int64_t n = tensor_x->shape_view().elem_cnt(); + if (n == 0) { return; } + MathUnaryElementwiseBackwardGpu + <<stream()->As()->cuda_stream()>>>(n, x, dy, dx); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD(math_type_str, math_func_prefix) \ + REGISTER_USER_KERNEL(math_type_str) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16) \ + && (user_op::HobDataType("y", 0) == DataType::kFloat16)); \ + \ + REGISTER_USER_KERNEL((std::string("") + math_type_str + "_grad")) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == DataType::kFloat16)); + +// OF_PP_FOR_EACH_TUPLE(REGISTER_MATH_UNARY_ELEMENTWISE_GUDA_HALF_KERNEL_AND_GRAD, +// MATH_UNARY_ELEMENTWISE_FUNC_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/max_pool_kernel.hip.cpp b/oneflow/user/kernels/max_pool_kernel.hip.cpp index cd3e86f..9ac75d0 100644 --- a/oneflow/user/kernels/max_pool_kernel.hip.cpp +++ b/oneflow/user/kernels/max_pool_kernel.hip.cpp @@ -1,289 +1,289 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/max_pool_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetMinThreadNum(int64_t elem_num) { return std::min(elem_num, kBlockSize); } - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -template -__device__ __inline__ void Maxpool2dForwardComputeCLast( - const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, - const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, - const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, - const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, - const int32_t dilation_w) { - IDX n, h, w, c; - CUDA_1D_KERNEL_LOOP(num, elem_num) { - index_helper.OffsetToNdIndex(num, n, h, w, c); - - const IDX x_start_idx = n * n_channel * x_width * x_height; - const IDX y_start_idx = n * n_channel * y_height * y_width; - IDX hstart = h * stride_h - padding_h; - IDX wstart = w * stride_w - padding_w; - const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height - ? (hstart + (kernel_size_h - 1) * dilation_h + 1) - : x_height; - const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width - ? 
(wstart + (kernel_size_w - 1) * dilation_w + 1) - : x_width; - - while (hstart < 0) { hstart += dilation_h; } - while (wstart < 0) { wstart += dilation_w; } - /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - IDX max_index = hstart * x_width + wstart; - IDX src_idx = 0; - /* equal to -std::numeric_limits::infinity(); */ - T max_value = detail::numeric_limits::lower_bound(); - - for (IDX i = hstart; i < hend; i++) { - for (IDX j = wstart; j < wend; j++) { - const IDX window_idx = i * x_width * n_channel + j * n_channel + c; - const IDX search_idx = x_start_idx + window_idx; - T val = src[search_idx]; - if (val > max_value || detail::numerics::isnan(val)) { - max_value = val; - max_index = window_idx; - src_idx = search_idx; - } - } - } - const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; - dest[out_idx] = src[src_idx]; - indice_ptr[out_idx] = max_index; - } -} - -} // namespace - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, - int32_t n_batch, int32_t n_channel, int32_t x_length, - int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { - Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, - n_channel, x_length, kernel_size_l, stride_l, dilation_l); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, - int32_t padding_w, int32_t n_batch, int32_t n_channel, - int32_t x_height, int32_t x_width, int32_t kernel_size_h, - int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, - int32_t dilation_h, int32_t dilation_w) { - Maxpool2dForwardComputeCFirst( - index_helper, elem_num, src, dest, indice_ptr, padding_h, padding_w, n_batch, n_channel, - x_height, x_width, kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, - int32_t padding_w, int32_t n_batch, int32_t n_channel, - int32_t x_height, int32_t x_width, int32_t y_height, - int32_t y_width, int32_t kernel_size_h, int32_t kernel_size_w, - int32_t stride_h, int32_t stride_w, int32_t dilation_h, - int32_t dilation_w) { - Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, - padding_w, n_batch, n_channel, x_height, x_width, y_height, - y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, - dilation_h, dilation_w); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, - const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, - int32_t padding_h, int32_t padding_w, int32_t n_batch, - int32_t n_channel, int32_t x_time, int32_t x_height, - int32_t x_width, int32_t kernel_size_t, int32_t kernel_size_h, - int32_t kernel_size_w, int32_t stride_t, int32_t stride_h, - int32_t stride_w, int32_t dilation_t, int32_t dilation_h, - int32_t dilation_w) { - Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, - padding_h, padding_w, n_batch, n_channel, x_time, x_height, - x_width, kernel_size_t, kernel_size_h, kernel_size_w, stride_t, - 
stride_h, stride_w, dilation_t, dilation_h, dilation_w); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, - const T* src, T* dest, const int64_t* indice_ptr, - const int32_t n_batch, const int32_t n_channel, - const int32_t src_length, const int32_t dst_length) { - Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_length, dst_length); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, - const int32_t n_channel, const int32_t src_height, - const int32_t src_width, const int32_t dst_height, - const int32_t dst_width) { - Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_height, src_width, dst_height, dst_width); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, - const int32_t n_channel, const int32_t src_height, - const int32_t src_width, const int32_t dst_height, - const int32_t dst_width) { - Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_height, src_width, dst_height, dst_width); -}; - -template -__launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, - const T* src, T* dest, const int64_t* indice_ptr, - const int32_t n_batch, const int32_t n_channel, - const int32_t src_time, const int32_t src_height, - const int32_t src_width, const int32_t dst_time, - const int32_t dst_height, const int32_t dst_width) { - Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_time, src_height, src_width, dst_time, dst_height, - dst_width); -}; - -template -struct PoolKernelUtil { - static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool1dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], - params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), - params_3d.pool_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); - } - - static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool1dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); - } - - static void Maxpool2dForwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dForwardCFirst<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], - 
params_3d.pool_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); - } - - static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dBackwardCFirst<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); - } - - static void Maxpool2dForwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dForwardCLast<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], - params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[1], - params_3d.dilation_3d()[2]); - } - - static void Maxpool2dBackwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool2dBackwardCLast<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); - } - - static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool3dForward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], - params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], - params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1], - params_3d.stride_3d()[2], params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], - params_3d.dilation_3d()[2]); - } - - static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { - DoCUDAMaxPool3dBackward<<As()->cuda_stream()>>>( - index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4)); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOL_KERNEL_UTIL, (DeviceType::kCUDA), - POOL_DATA_TYPE_CUDA_SEQ, POOL_IDX_DATA_TYPE_SEQ); - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
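/*
 * Illustration: the max-pool forward kernels in this file compute a dilated
 * window [hstart, hend) x [wstart, wend), clamp the right edge to the input
 * extent, step a negative start forward by whole dilation strides, and take a
 * NaN-propagating maximum (the explicit isnan() check lets a NaN win even
 * though NaN comparisons are false). A minimal 1-D sketch of that window
 * arithmetic; WindowBounds is a hypothetical helper, not part of this file.
 */
#include <cstdint>

struct Window {
  int64_t start;
  int64_t end;
};

inline Window WindowBounds(int64_t out_pos, int64_t stride, int64_t padding,
                           int64_t kernel_size, int64_t dilation, int64_t input_size) {
  int64_t start = out_pos * stride - padding;
  // Right edge of the dilated kernel, clamped to the input extent.
  int64_t end = start + (kernel_size - 1) * dilation + 1;
  if (end > input_size) { end = input_size; }
  // A negative start is advanced in dilation-sized steps, as the kernels do.
  while (start < 0) { start += dilation; }
  return {start, end};
}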
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/max_pool_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetMinThreadNum(int64_t elem_num) { return std::min(elem_num, kBlockSize); } + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +template +__device__ __inline__ void Maxpool2dForwardComputeCLast( + const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, + int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, + const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, + const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, + const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, + const int32_t dilation_w) { + IDX n, h, w, c; + CUDA_1D_KERNEL_LOOP(num, elem_num) { + index_helper.OffsetToNdIndex(num, n, h, w, c); + + const IDX x_start_idx = n * n_channel * x_width * x_height; + const IDX y_start_idx = n * n_channel * y_height * y_width; + IDX hstart = h * stride_h - padding_h; + IDX wstart = w * stride_w - padding_w; + const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + ? (hstart + (kernel_size_h - 1) * dilation_h + 1) + : x_height; + const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + ? 
(wstart + (kernel_size_w - 1) * dilation_w + 1) + : x_width; + + while (hstart < 0) { hstart += dilation_h; } + while (wstart < 0) { wstart += dilation_w; } + /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ + IDX max_index = hstart * x_width + wstart; + IDX src_idx = 0; + /* equal to -std::numeric_limits::infinity(); */ + T max_value = detail::numeric_limits::lower_bound(); + + for (IDX i = hstart; i < hend; i++) { + for (IDX j = wstart; j < wend; j++) { + const IDX window_idx = i * x_width * n_channel + j * n_channel + c; + const IDX search_idx = x_start_idx + window_idx; + T val = src[search_idx]; + if (val > max_value || detail::numerics::isnan(val)) { + max_value = val; + max_index = window_idx; + src_idx = search_idx; + } + } + } + const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; + dest[out_idx] = src[src_idx]; + indice_ptr[out_idx] = max_index; + } +} + +} // namespace + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, + int32_t n_batch, int32_t n_channel, int32_t x_length, + int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { + Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, + n_channel, x_length, kernel_size_l, stride_l, dilation_l); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, + int32_t padding_w, int32_t n_batch, int32_t n_channel, + int32_t x_height, int32_t x_width, int32_t kernel_size_h, + int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, + int32_t dilation_h, int32_t dilation_w) { + Maxpool2dForwardComputeCFirst( + index_helper, elem_num, src, dest, indice_ptr, padding_h, padding_w, n_batch, n_channel, + x_height, x_width, kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, + int32_t padding_w, int32_t n_batch, int32_t n_channel, + int32_t x_height, int32_t x_width, int32_t y_height, + int32_t y_width, int32_t kernel_size_h, int32_t kernel_size_w, + int32_t stride_h, int32_t stride_w, int32_t dilation_h, + int32_t dilation_w) { + Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, + padding_w, n_batch, n_channel, x_height, x_width, y_height, + y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, + dilation_h, dilation_w); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, + int32_t padding_h, int32_t padding_w, int32_t n_batch, + int32_t n_channel, int32_t x_time, int32_t x_height, + int32_t x_width, int32_t kernel_size_t, int32_t kernel_size_h, + int32_t kernel_size_w, int32_t stride_t, int32_t stride_h, + int32_t stride_w, int32_t dilation_t, int32_t dilation_h, + int32_t dilation_w) { + Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, + padding_h, padding_w, n_batch, n_channel, x_time, x_height, + x_width, kernel_size_t, kernel_size_h, kernel_size_w, stride_t, + 
stride_h, stride_w, dilation_t, dilation_h, dilation_w); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, + const T* src, T* dest, const int64_t* indice_ptr, + const int32_t n_batch, const int32_t n_channel, + const int32_t src_length, const int32_t dst_length) { + Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_length, dst_length); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_height, + const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { + Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_height, src_width, dst_height, dst_width); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_height, + const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { + Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_height, src_width, dst_height, dst_width); +}; + +template +__launch_bounds__(kBlockSize) __global__ + void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, + const T* src, T* dest, const int64_t* indice_ptr, + const int32_t n_batch, const int32_t n_channel, + const int32_t src_time, const int32_t src_height, + const int32_t src_width, const int32_t dst_time, + const int32_t dst_height, const int32_t dst_width) { + Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_time, src_height, src_width, dst_time, dst_height, + dst_width); +}; + +template +struct PoolKernelUtil { + static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool1dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], + params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), + params_3d.pool_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); + } + + static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool1dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); + } + + static void Maxpool2dForwardCFirst(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dForwardCFirst<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], + 
params_3d.pool_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); + } + + static void Maxpool2dBackwardCFirst(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dBackwardCFirst<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); + } + + static void Maxpool2dForwardCLast(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dForwardCLast<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], + params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), + params_3d.GetYShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[1], + params_3d.dilation_3d()[2]); + } + + static void Maxpool2dBackwardCLast(ep::Stream* stream, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool2dBackwardCLast<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); + } + + static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool3dForward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], + params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], + params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1], + params_3d.stride_3d()[2], params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], + params_3d.dilation_3d()[2]); + } + + static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const MaxPoolParams3D& params_3d) { + DoCUDAMaxPool3dBackward<<As()->cuda_stream()>>>( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), + params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4)); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOL_KERNEL_UTIL, (DeviceType::kCUDA), + POOL_DATA_TYPE_CUDA_SEQ, POOL_IDX_DATA_TYPE_SEQ); + +} // namespace oneflow #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/median_kernel.hip.cpp b/oneflow/user/kernels/median_kernel.hip.cpp index 34bdae2..7f71b02 100644 --- 
a/oneflow/user/kernels/median_kernel.hip.cpp +++ b/oneflow/user/kernels/median_kernel.hip.cpp @@ -1,69 +1,69 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" - -namespace oneflow { - -template -class CudaMedianKernel final : public user_op::OpKernel { - public: - CudaMedianKernel() = default; - ~CudaMedianKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("output", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int32_t instance_size = in->shape_view().elem_cnt(); - const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(T)); - SortKeysAscending( - in->dptr(), 1, instance_size, - reinterpret_cast(tmp_buffer->mut_dptr() + sort_tensor_buffer_bytes), - tmp_buffer->shape_view().elem_cnt() - sort_tensor_buffer_bytes, tmp_buffer->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - Memcpy(ctx->stream(), out->mut_dptr(), - tmp_buffer->mut_dptr() + (instance_size - 1) / 2, sizeof(T)); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_MEDIAN_KERNEL(dtype) \ - REGISTER_USER_KERNEL("median") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("input", 0); \ - const int32_t instance_size = in_shape.elem_cnt(); \ - size_t sort_tmp_buffer_bytes = \ - InferTempStorageForSortKeysAscending(1, instance_size); \ - size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(dtype)); \ - return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes; \ - }); - -REGISTER_CUDA_MEDIAN_KERNEL(float) -REGISTER_CUDA_MEDIAN_KERNEL(double) -REGISTER_CUDA_MEDIAN_KERNEL(int8_t) -REGISTER_CUDA_MEDIAN_KERNEL(uint8_t) -REGISTER_CUDA_MEDIAN_KERNEL(int32_t) -REGISTER_CUDA_MEDIAN_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" + +namespace oneflow { + +template +class CudaMedianKernel final : public user_op::OpKernel { + public: + CudaMedianKernel() = default; + ~CudaMedianKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("output", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int32_t instance_size = in->shape_view().elem_cnt(); + const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(T)); + SortKeysAscending( + in->dptr(), 1, instance_size, + reinterpret_cast(tmp_buffer->mut_dptr() + sort_tensor_buffer_bytes), + tmp_buffer->shape_view().elem_cnt() - sort_tensor_buffer_bytes, tmp_buffer->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + Memcpy(ctx->stream(), out->mut_dptr(), + tmp_buffer->mut_dptr() + (instance_size - 1) / 2, sizeof(T)); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_MEDIAN_KERNEL(dtype) \ + REGISTER_USER_KERNEL("median") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + const Shape& in_shape = ctx->InputShape("input", 0); \ + const int32_t instance_size = in_shape.elem_cnt(); \ + size_t sort_tmp_buffer_bytes = \ + InferTempStorageForSortKeysAscending(1, instance_size); \ + size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(instance_size * sizeof(dtype)); \ + return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes; \ + }); + +REGISTER_CUDA_MEDIAN_KERNEL(float) +REGISTER_CUDA_MEDIAN_KERNEL(double) +REGISTER_CUDA_MEDIAN_KERNEL(int8_t) +REGISTER_CUDA_MEDIAN_KERNEL(uint8_t) +REGISTER_CUDA_MEDIAN_KERNEL(int32_t) +REGISTER_CUDA_MEDIAN_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/median_with_indices_kernel.hip.cpp b/oneflow/user/kernels/median_with_indices_kernel.hip.cpp index e5a8187..9485de1 100644 --- a/oneflow/user/kernels/median_with_indices_kernel.hip.cpp +++ b/oneflow/user/kernels/median_with_indices_kernel.hip.cpp @@ -1,156 +1,156 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
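/*
 * Illustration: the median kernel above sorts the flattened input ascending
 * with a device radix sort and then copies out the element at index
 * (instance_size - 1) / 2, i.e. the lower median. A minimal host-side sketch
 * of the same selection; LowerMedian and the use of std::nth_element are
 * illustrative stand-ins, not part of this kernel.
 */
#include <algorithm>
#include <vector>

template<typename T>
T LowerMedian(std::vector<T> v) {
  // Assumes a non-empty input. Place the element with sorted rank (n - 1) / 2
  // at that position; reading it matches picking sorted[(instance_size - 1) / 2]
  // after a full ascending sort.
  const size_t nth = (v.size() - 1) / 2;
  std::nth_element(v.begin(), v.begin() + nth, v.end());
  return v[nth];
}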
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void MedianSelectCuda(const IDX reduce_elem_cnt, const IDX stride, const T* in, - const int64_t* sort_indices, T* values, int64_t* indices) { - IDX nth = (stride - 1) / 2; - CUDA_1D_KERNEL_LOOP_T(IDX, i, reduce_elem_cnt) { - values[i] = in[i * stride + nth]; - indices[i] = sort_indices[i * stride + nth]; - } -} - -bool IsSafeUseIndex32(int64_t elem_cnt) { return elem_cnt < GetMaxVal() / 2; } - -template -void DispatchIndexSize(ep::Stream* stream, const int64_t elem_cnt, const int64_t stride, - const T* in, const int64_t* sort_indices, T* out, int64_t* out_indices) { - const int64_t reduce_elem_cnt = elem_cnt / stride; - if (IsSafeUseIndex32(elem_cnt)) { - RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, - stride, in, sort_indices, out, out_indices); - } else { - RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, - stride, in, sort_indices, out, out_indices); - } -} - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(size_t capacity, void* ptr, const ShapeView& in_shape) - : capacity_{capacity}, - sorted_in_elem_cnt_{in_shape.elem_cnt()}, - indices_elem_cnt_{sorted_in_elem_cnt_} { - const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); - const size_t sort_indices_buffer_bytes = - GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); - sorted_in_ptr_ = reinterpret_cast(ptr); - in_indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) - + sort_tensor_buffer_bytes); - out_indices_ptr_ = reinterpret_cast(reinterpret_cast(in_indices_ptr_) - + sort_indices_buffer_bytes); - temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(out_indices_ptr_) - + sort_indices_buffer_bytes); - temp_storage_bytes_ = capacity_ - sort_tensor_buffer_bytes - sort_indices_buffer_bytes * 2; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - T* SortedInPtr() const { return sorted_in_ptr_; } - int64_t* InIndicesPtr() const { return in_indices_ptr_; } - int64_t* OutIndicesPtr() const { return out_indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - size_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - size_t capacity_; - - T* sorted_in_ptr_; - int64_t* in_indices_ptr_; - int64_t* out_indices_ptr_; - void* temp_storage_ptr_; - - int64_t sorted_in_elem_cnt_; - int64_t indices_elem_cnt_; - size_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; -} - -} // namespace - -template -class CudaMedianWithIndicesKernel final : public user_op::OpKernel { - public: - CudaMedianWithIndicesKernel() = default; - ~CudaMedianWithIndicesKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); - if (in->shape_view().elem_cnt() == 0) return; - user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); - user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 
0); - TmpBufferManager buf_manager(tmp_buffer->shape_view().elem_cnt(), - tmp_buffer->mut_dptr(), in->shape_view()); - - const int64_t elem_cnt = in->shape_view().elem_cnt(); - const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int64_t instance_num = elem_cnt / instance_size; - RUN_CUDA_KERNEL(InitializeIndices, ctx->stream(), elem_cnt, elem_cnt, - buf_manager.InIndicesPtr(), instance_size); - SortPairsAscending(in->dptr(), buf_manager.InIndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), buf_manager.OutIndicesPtr(), - ctx->stream()->As()->cuda_stream()); - DispatchIndexSize(ctx->stream(), elem_cnt, instance_size, buf_manager.SortedInPtr(), - buf_manager.OutIndicesPtr(), values->mut_dptr(), - indices->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(dtype) \ - REGISTER_USER_KERNEL("median_with_indices") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("input", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("input", 0); \ - const int64_t instance_size = in_shape.dim_vec().back(); \ - const int64_t instance_num = in_shape.elem_cnt() / instance_size; \ - size_t sort_tmp_buffer_bytes = \ - InferTempStorageForSortPairsAscending(instance_num, instance_size); \ - size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - size_t sort_indices_buffer_bytes = \ - GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(int64_t)); \ - return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes + sort_indices_buffer_bytes * 2; \ - }); - -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(float) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(double) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int8_t) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(uint8_t) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int32_t) -REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void MedianSelectCuda(const IDX reduce_elem_cnt, const IDX stride, const T* in, + const int64_t* sort_indices, T* values, int64_t* indices) { + IDX nth = (stride - 1) / 2; + CUDA_1D_KERNEL_LOOP_T(IDX, i, reduce_elem_cnt) { + values[i] = in[i * stride + nth]; + indices[i] = sort_indices[i * stride + nth]; + } +} + +bool IsSafeUseIndex32(int64_t elem_cnt) { return elem_cnt < GetMaxVal() / 2; } + +template +void DispatchIndexSize(ep::Stream* stream, const int64_t elem_cnt, const int64_t stride, + const T* in, const int64_t* sort_indices, T* out, int64_t* out_indices) { + const int64_t reduce_elem_cnt = elem_cnt / stride; + if (IsSafeUseIndex32(elem_cnt)) { + RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, + stride, in, sort_indices, out, out_indices); + } else { + RUN_CUDA_KERNEL((MedianSelectCuda), stream, reduce_elem_cnt, reduce_elem_cnt, + stride, in, sort_indices, out, out_indices); + } +} + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(size_t capacity, void* ptr, const ShapeView& in_shape) + : capacity_{capacity}, + sorted_in_elem_cnt_{in_shape.elem_cnt()}, + indices_elem_cnt_{sorted_in_elem_cnt_} { + const size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); + const size_t sort_indices_buffer_bytes = + GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); + sorted_in_ptr_ = reinterpret_cast(ptr); + in_indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) + + sort_tensor_buffer_bytes); + out_indices_ptr_ = reinterpret_cast(reinterpret_cast(in_indices_ptr_) + + sort_indices_buffer_bytes); + temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(out_indices_ptr_) + + sort_indices_buffer_bytes); + temp_storage_bytes_ = capacity_ - sort_tensor_buffer_bytes - sort_indices_buffer_bytes * 2; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + T* SortedInPtr() const { return sorted_in_ptr_; } + int64_t* InIndicesPtr() const { return in_indices_ptr_; } + int64_t* OutIndicesPtr() const { return out_indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + size_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + size_t capacity_; + + T* sorted_in_ptr_; + int64_t* in_indices_ptr_; + int64_t* out_indices_ptr_; + void* temp_storage_ptr_; + + int64_t sorted_in_elem_cnt_; + int64_t indices_elem_cnt_; + size_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; +} + +} // namespace + +template +class CudaMedianWithIndicesKernel final : public user_op::OpKernel { + public: + CudaMedianWithIndicesKernel() = default; + ~CudaMedianWithIndicesKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("input", 0); + if (in->shape_view().elem_cnt() == 0) return; + user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); + user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 
0); + TmpBufferManager buf_manager(tmp_buffer->shape_view().elem_cnt(), + tmp_buffer->mut_dptr(), in->shape_view()); + + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = elem_cnt / instance_size; + RUN_CUDA_KERNEL(InitializeIndices, ctx->stream(), elem_cnt, elem_cnt, + buf_manager.InIndicesPtr(), instance_size); + SortPairsAscending(in->dptr(), buf_manager.InIndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), buf_manager.OutIndicesPtr(), + ctx->stream()->As()->cuda_stream()); + DispatchIndexSize(ctx->stream(), elem_cnt, instance_size, buf_manager.SortedInPtr(), + buf_manager.OutIndicesPtr(), values->mut_dptr(), + indices->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(dtype) \ + REGISTER_USER_KERNEL("median_with_indices") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("input", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + const Shape& in_shape = ctx->InputShape("input", 0); \ + const int64_t instance_size = in_shape.dim_vec().back(); \ + const int64_t instance_num = in_shape.elem_cnt() / instance_size; \ + size_t sort_tmp_buffer_bytes = \ + InferTempStorageForSortPairsAscending(instance_num, instance_size); \ + size_t sort_tensor_buffer_bytes = GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + size_t sort_indices_buffer_bytes = \ + GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(int64_t)); \ + return sort_tmp_buffer_bytes + sort_tensor_buffer_bytes + sort_indices_buffer_bytes * 2; \ + }); + +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(float) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(double) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int8_t) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(uint8_t) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int32_t) +REGISTER_CUDA_MEDIAN_WITH_INDICES_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/min_max_observer_kernel.hip.cpp b/oneflow/user/kernels/min_max_observer_kernel.hip.cpp index 82c4cb4..df27571 100644 --- a/oneflow/user/kernels/min_max_observer_kernel.hip.cpp +++ b/oneflow/user/kernels/min_max_observer_kernel.hip.cpp @@ -1,260 +1,260 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
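/*
 * Illustration: TmpBufferManager above slices one workspace allocation into a
 * sorted-values region, two int64 index regions, and whatever remains as sort
 * temp storage, with each region size rounded up to a CUDA-friendly alignment.
 * A minimal sketch of that layout arithmetic; AlignUp and the 512-byte
 * alignment are illustrative assumptions standing in for GetCudaAlignedSize.
 */
#include <cstddef>
#include <cstdint>

constexpr std::size_t kAlignment = 512;

inline std::size_t AlignUp(std::size_t n) {
  return (n + kAlignment - 1) / kAlignment * kAlignment;
}

struct WorkspaceLayout {
  std::size_t values_bytes;        // sorted values: elem_cnt * sizeof(T)
  std::size_t indices_bytes;       // one index buffer: elem_cnt * sizeof(int64_t)
  std::size_t temp_storage_bytes;  // remainder, handed to the sort routine
};

inline WorkspaceLayout MakeLayout(std::size_t capacity, std::size_t elem_cnt,
                                  std::size_t value_size) {
  WorkspaceLayout layout{};
  layout.values_bytes = AlignUp(elem_cnt * value_size);
  layout.indices_bytes = AlignUp(elem_cnt * sizeof(std::int64_t));
  // Two index buffers (input order and sorted order) are carved out of capacity.
  layout.temp_storage_bytes =
      capacity - layout.values_bytes - 2 * layout.indices_bytes;
  return layout;
}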
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#include - -namespace oneflow { - -namespace { - -// NOTE(Liang Depeng): refer to -// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda -template -__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, - T* min_ptr) { - extern __shared__ unsigned char shared_max_min_memory[]; - T* shared_max = reinterpret_cast(shared_max_min_memory); - T* shared_min = shared_max + blockDim.x; - - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - shared_max[tid] = -FLT_MAX; - shared_min[tid] = -FLT_MAX; - - while (gid < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[gid]); - shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); - gid += gridDim.x * blockDim.x; - } - __syncthreads(); - gid = (blockDim.x * blockIdx.x) + tid; - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s && gid < elements) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); - } - __syncthreads(); - } - - if (tid == 0) { - cuda::atomic::Max(max_ptr, shared_max[0]); - cuda::atomic::Max(min_ptr, shared_min[0]); - } -} - -template -__global__ void ReduceMaxMinPerChannel(const T* input_ptr, const int64_t elements, - const int64_t num_channels, const int64_t panel_size, - T* max_ptr, T* min_ptr) { - extern __shared__ unsigned char shared_max_min_memory[]; - T* shared_max = reinterpret_cast(shared_max_min_memory); - T* shared_min = shared_max + blockDim.x; - - int64_t cur_channel = blockIdx.x; - int64_t tid = threadIdx.x; - - while (cur_channel < num_channels) { - shared_max[tid] = -FLT_MAX; - shared_min[tid] = -FLT_MAX; - - int64_t index = (panel_size * cur_channel) + tid; - int64_t end = panel_size * (cur_channel + 1); - - while (index < end && index < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[index]); - shared_min[tid] = max(shared_min[tid], -input_ptr[index]); - index += blockDim.x; - } - __syncthreads(); - - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); - } - __syncthreads(); - } - - if (tid == 0) { - cuda::atomic::Max(&max_ptr[cur_channel], shared_max[0]); - cuda::atomic::Max(&min_ptr[cur_channel], shared_min[0]); - } - - // __syncthreads(); - cur_channel += gridDim.x; - } -} - -template -__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - max_ptr[gid] = -FLT_MAX; - min_ptr[gid] = -FLT_MAX; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointSymmetric(const T* max_ptr, const T* min_ptr, - const int64_t elements, const double quantization_bit, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = weight_max / denominator; - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointAffine(const T* max_ptr, const T* 
min_ptr, const int64_t elements, - const double quantization_bit, T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit)) - 1; - T min = -min_ptr[gid]; - T s = (max_ptr[gid] - min) / denominator; - scale[gid] = s; - zero_point[gid] = -nearbyint(min / s); - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointCambricon(const T* max_ptr, const T* min_ptr, - const int64_t elements, const double quantization_bit, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - // T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = floor(log2(weight_max)) - (quantization_bit - 2); - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, - size_t shared_mem_size) { - ep::CudaLaunchConfig config; - stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); - config.shared_mem_size = shared_mem_size; - return config; -} - -} // namespace - -#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) \ - (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); - -template -class GpuMinMaxObserverKernel final : public user_op::OpKernel { - public: - GpuMinMaxObserverKernel() = default; - ~GpuMinMaxObserverKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - const int64_t elements = in->shape_view().elem_cnt(); - const int64_t channel = scale->shape_view().At(0); - const int64_t panel_size = elements / channel; - T* max_ptr = tmp_buffer->mut_dptr(); - T* min_ptr = max_ptr + channel; - auto* cuda_stream = ctx->stream()->As(); - LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, channel, 0, channel, max_ptr, min_ptr); - - if (per_layer_quantization) { - LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, - kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, - min_ptr); - } else { // per-channel quantization - // NOTE(Liang Depeng): each block of threads will be responsible for - // computing the max and min values of the whole channel. 
- LAUNCH_CUDA_KERNEL((ReduceMaxMinPerChannel), cuda_stream, - channel * kCudaThreadsNumPerBlock, kCudaThreadsNumPerBlock * 2 * sizeof(T), - in->dptr(), elements, channel, panel_size, max_ptr, min_ptr); - } - - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, channel, 0, max_ptr, - min_ptr, channel, static_cast(quantization_bit), - scale->mut_dptr(), zero_point->mut_dptr()); - } else { // quantization_scheme == "affine" - LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, channel, 0, max_ptr, min_ptr, - channel, static_cast(quantization_bit), scale->mut_dptr(), - zero_point->mut_dptr()); - } - } else if (quantization_formula == "cambricon") { - if (!per_layer_quantization) { - UNIMPLEMENTED() << " per-channel mode is not supported in cambricon scheme"; - } - LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, channel, 0, max_ptr, min_ptr, - channel, static_cast(quantization_bit), scale->mut_dptr(), - zero_point->mut_dptr()); - } else { - UNIMPLEMENTED(); - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MIN_MAX_OBSERVER_KERNEL(dtype) \ - REGISTER_USER_KERNEL("min_max_observer") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - size_t tmp_buffer_size = 1; \ - if (ctx->Attr("per_layer_quantization") == false) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - tmp_buffer_size = in_shape.At(0); \ - } \ - return 2 * tmp_buffer_size * sizeof(dtype); \ - }) - -REGISTER_MIN_MAX_OBSERVER_KERNEL(float); -REGISTER_MIN_MAX_OBSERVER_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
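/*
 * Illustration: the ReduceMaxMin kernels in this file only ever reduce with
 * max(). The minimum is recovered by accumulating max(-x) and negating later,
 * so a single cuda::atomic::Max primitive serves both statistics. A scalar
 * sketch of the identity min(v) == -max(-v) that the kernels rely on.
 */
#include <algorithm>
#include <cfloat>
#include <vector>

inline void MaxAndNegatedMin(const std::vector<float>& v, float* max_out,
                             float* negated_min_out) {
  float running_max = -FLT_MAX;      // mirrors shared_max[tid] = -FLT_MAX
  float running_neg_min = -FLT_MAX;  // mirrors shared_min[tid] = -FLT_MAX
  for (const float x : v) {
    running_max = std::max(running_max, x);
    running_neg_min = std::max(running_neg_min, -x);  // min(v) == -max(-v)
  }
  *max_out = running_max;
  *negated_min_out = running_neg_min;  // negate on the consumer side for the true minimum
}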
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#include + +namespace oneflow { + +namespace { + +// NOTE(Liang Depeng): refer to +// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda +template +__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, + T* min_ptr) { + extern __shared__ unsigned char shared_max_min_memory[]; + T* shared_max = reinterpret_cast(shared_max_min_memory); + T* shared_min = shared_max + blockDim.x; + + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + shared_max[tid] = -FLT_MAX; + shared_min[tid] = -FLT_MAX; + + while (gid < elements) { + shared_max[tid] = max(shared_max[tid], input_ptr[gid]); + shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); + gid += gridDim.x * blockDim.x; + } + __syncthreads(); + gid = (blockDim.x * blockIdx.x) + tid; + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && gid < elements) { + shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + } + __syncthreads(); + } + + if (tid == 0) { + cuda::atomic::Max(max_ptr, shared_max[0]); + cuda::atomic::Max(min_ptr, shared_min[0]); + } +} + +template +__global__ void ReduceMaxMinPerChannel(const T* input_ptr, const int64_t elements, + const int64_t num_channels, const int64_t panel_size, + T* max_ptr, T* min_ptr) { + extern __shared__ unsigned char shared_max_min_memory[]; + T* shared_max = reinterpret_cast(shared_max_min_memory); + T* shared_min = shared_max + blockDim.x; + + int64_t cur_channel = blockIdx.x; + int64_t tid = threadIdx.x; + + while (cur_channel < num_channels) { + shared_max[tid] = -FLT_MAX; + shared_min[tid] = -FLT_MAX; + + int64_t index = (panel_size * cur_channel) + tid; + int64_t end = panel_size * (cur_channel + 1); + + while (index < end && index < elements) { + shared_max[tid] = max(shared_max[tid], input_ptr[index]); + shared_min[tid] = max(shared_min[tid], -input_ptr[index]); + index += blockDim.x; + } + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + } + __syncthreads(); + } + + if (tid == 0) { + cuda::atomic::Max(&max_ptr[cur_channel], shared_max[0]); + cuda::atomic::Max(&min_ptr[cur_channel], shared_min[0]); + } + + // __syncthreads(); + cur_channel += gridDim.x; + } +} + +template +__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + max_ptr[gid] = -FLT_MAX; + min_ptr[gid] = -FLT_MAX; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointSymmetric(const T* max_ptr, const T* min_ptr, + const int64_t elements, const double quantization_bit, + T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = weight_max / denominator; + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointAffine(const T* max_ptr, const T* 
min_ptr, const int64_t elements, + const double quantization_bit, T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit)) - 1; + T min = -min_ptr[gid]; + T s = (max_ptr[gid] - min) / denominator; + scale[gid] = s; + zero_point[gid] = -nearbyint(min / s); + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointCambricon(const T* max_ptr, const T* min_ptr, + const int64_t elements, const double quantization_bit, + T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T weight_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + // T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = floor(log2(weight_max)) - (quantization_bit - 2); + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, + size_t shared_mem_size) { + ep::CudaLaunchConfig config; + stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); + config.shared_mem_size = shared_mem_size; + return config; +} + +} // namespace + +#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) \ + (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); + +template +class GpuMinMaxObserverKernel final : public user_op::OpKernel { + public: + GpuMinMaxObserverKernel() = default; + ~GpuMinMaxObserverKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const bool per_layer_quantization = ctx->Attr("per_layer_quantization"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t channel = scale->shape_view().At(0); + const int64_t panel_size = elements / channel; + T* max_ptr = tmp_buffer->mut_dptr(); + T* min_ptr = max_ptr + channel; + auto* cuda_stream = ctx->stream()->As(); + LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, channel, 0, channel, max_ptr, min_ptr); + + if (per_layer_quantization) { + LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, + kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, + min_ptr); + } else { // per-channel quantization + // NOTE(Liang Depeng): each block of threads will be responsible for + // computing the max and min values of the whole channel. 
+ LAUNCH_CUDA_KERNEL((ReduceMaxMinPerChannel), cuda_stream, + channel * kCudaThreadsNumPerBlock, kCudaThreadsNumPerBlock * 2 * sizeof(T), + in->dptr(), elements, channel, panel_size, max_ptr, min_ptr); + } + + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, channel, 0, max_ptr, + min_ptr, channel, static_cast(quantization_bit), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { // quantization_scheme == "affine" + LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, channel, 0, max_ptr, min_ptr, + channel, static_cast(quantization_bit), scale->mut_dptr(), + zero_point->mut_dptr()); + } + } else if (quantization_formula == "cambricon") { + if (!per_layer_quantization) { + UNIMPLEMENTED() << " per-channel mode is not supported in cambricon scheme"; + } + LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, channel, 0, max_ptr, min_ptr, + channel, static_cast(quantization_bit), scale->mut_dptr(), + zero_point->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MIN_MAX_OBSERVER_KERNEL(dtype) \ + REGISTER_USER_KERNEL("min_max_observer") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + size_t tmp_buffer_size = 1; \ + if (ctx->Attr("per_layer_quantization") == false) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + tmp_buffer_size = in_shape.At(0); \ + } \ + return 2 * tmp_buffer_size * sizeof(dtype); \ + }) + +REGISTER_MIN_MAX_OBSERVER_KERNEL(float); +REGISTER_MIN_MAX_OBSERVER_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/model_update_kernel_util.hip.cpp b/oneflow/user/kernels/model_update_kernel_util.hip.cpp index 22e1510..ddb698d 100644 --- a/oneflow/user/kernels/model_update_kernel_util.hip.cpp +++ b/oneflow/user/kernels/model_update_kernel_util.hip.cpp @@ -1,799 +1,799 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
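/*
 * Illustration: the scale and zero-point formulas used by the kernels above,
 * written out on the host. "symmetric" spreads max(|max|, |min|) over
 * 2^(bit-1) - 1 levels with zero_point fixed at 0; "affine" spreads
 * (max - min) over 2^bit - 1 levels with zero_point = -nearbyint(min / scale).
 * min_val here is the true minimum; the kernels store its negation, hence the
 * -min_ptr[gid] in CalScaleZeroPointAffine. A minimal sketch mirroring those
 * formulas.
 */
#include <algorithm>
#include <cmath>

struct QuantParams {
  double scale;
  double zero_point;
};

inline QuantParams SymmetricParams(double max_val, double min_val, int quantization_bit) {
  const double weight_max = std::max(std::fabs(max_val), std::fabs(min_val));
  const double denominator = std::pow(2.0, quantization_bit - 1) - 1.0;
  return {weight_max / denominator, 0.0};
}

inline QuantParams AffineParams(double max_val, double min_val, int quantization_bit) {
  const double denominator = std::pow(2.0, quantization_bit) - 1.0;
  const double scale = (max_val - min_val) / denominator;
  return {scale, -std::nearbyint(min_val / scale)};
}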
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/model_update_kernel_util.h" -#include -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void SGDUpdateGpu(int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* model) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - SGDUpdateFunctor()(model_diff + i, model + i, scale, l1, l2, weight_decay, - learning_rate_val); - } -} - -template -__global__ void IndexedSlicesSGDUpdateGpu(float weight_decay, const IDX feature_size, - const int64_t lower_bound, const int64_t upper_bound, - const IDX* num_unique_instance, - const float* learning_rate, const K* indices, - const T* values, T* model) { - const int64_t n = *num_unique_instance * feature_size; - const T lr = *learning_rate; - CUDA_1D_KERNEL_LOOP_T(IDX, i, n) { - const IDX indices_idx = i / feature_size; - const IDX inner_idx = i - indices_idx * feature_size; - const IDX instance_id = indices[indices_idx]; - if (instance_id >= lower_bound && instance_id < upper_bound) { - const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; - SGDUpdateFunctor()(values + i, model + model_idx, static_cast(1), 0.0, 0.0, - weight_decay, lr); - } - } -} - -template -__global__ void SumSquares2(int64_t n, const T* src0, T* dst0, const T* src1, T* dst1) { - T t_sum0 = 0; - T t_sum1 = 0; - CUDA_1D_KERNEL_LOOP(i, n) { - t_sum0 += src0[i] * src0[i]; - t_sum1 += src1[i] * src1[i]; - } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage0; - __shared__ typename BlockReduce::TempStorage temp_storage1; - T b_sum0 = BlockReduce(temp_storage0).Sum(t_sum0); - T b_sum1 = BlockReduce(temp_storage1).Sum(t_sum1); - if (threadIdx.x == 0) { - cuda::atomic::Add(dst0, b_sum0); - cuda::atomic::Add(dst1, b_sum1); - } -} - -} // namespace - -template -struct SGDUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model); -}; - -template -void SGDUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model) { - SGDUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, - model_diff, model); -} - -template -struct SGDUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model); -}; - -template -void SGDUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model) { - SGDUpdateKernelUtil::Update( - stream, n, scale, l1, 
l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, - skip_if, reinterpret_cast(model_diff), model); -} - -template struct SGDUpdateKernelUtil; -template struct SGDUpdateKernelUtil; -template struct SGDUpdateKernelUtil; - -template -struct IndexedSlicesSGDUpdateKernelUtil { - static void Update(ep::Stream* stream, float weight_decay, int64_t num_indices, - int64_t feature_size, int64_t lower_bound, int64_t upper_bound, - const IDX* num_unique_instance, const float* learning_rate, const K* indices, - const T* values, T* model); -}; - -template -void IndexedSlicesSGDUpdateKernelUtil::Update( - ep::Stream* stream, float weight_decay, int64_t num_indices, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const K* indices, const T* values, T* model) { - IndexedSlicesSGDUpdateGpu - <<As()->cuda_stream()>>>(weight_decay, feature_size, lower_bound, - upper_bound, num_unique_instance, - learning_rate, indices, values, model); -} - -#define INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA(val_type_pair, key_type_pair, \ - idx_type_pair) \ - template struct IndexedSlicesSGDUpdateKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ - OF_PP_PAIR_FIRST(idx_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA - -namespace { - -template -__global__ void MomentumUpdateGpu(int64_t n, T scale, float l1, float l2, float beta, - float weight_decay, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, - T* momentum) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - MomentumUpdateFunctor()(model_diff + i, model + i, momentum + i, scale, l1, l2, beta, - weight_decay, learning_rate_val); - } -} - -template -__global__ void IndexedSlicesMomentumUpdateGpu(T beta, float weight_decay, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, - const IDX* num_unique_instance, - const float* learning_rate, const K* indices, - const T* values, T* model, T* momentum) { - const int64_t n = *num_unique_instance * feature_size; - const T lr = *learning_rate; - CUDA_1D_KERNEL_LOOP(i, n) { - const IDX indices_idx = i / feature_size; - const IDX inner_idx = i - indices_idx * feature_size; - const IDX instance_id = indices[indices_idx]; - if (instance_id >= lower_bound && instance_id < upper_bound) { - const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; - MomentumUpdateFunctor()(values + i, model + model_idx, momentum + model_idx, - static_cast(1), 0.0, 0.0, beta, weight_decay, lr); - } - } -} - -} // namespace - -template -struct MomentumUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, - float weight_decay, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, - T* momentum); -}; - -template -void MomentumUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* 
scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* momentum) { - MomentumUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, - model_diff, model, momentum); -} - -template -struct MomentumUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, - float weight_decay, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, - T* model, T* momentum); -}; - -template -void MomentumUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model, T* momentum) { - MomentumUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, - skip_if, reinterpret_cast(model_diff), model, momentum); -} - -template struct MomentumUpdateKernelUtil; -template struct MomentumUpdateKernelUtil; -template struct MomentumUpdateKernelUtil; - -template -struct IndexedSlicesMomentumMdUpdateKernelUtil { - static void Update(ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, - int64_t feature_size, int64_t lower_bound, int64_t upper_bound, - const IDX* num_unique_instance, const float* learning_rate, const K* indices, - const T* values, T* model, T* momentum); -}; - -template -void IndexedSlicesMomentumMdUpdateKernelUtil::Update( - ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const K* indices, const T* values, T* model, T* momentum) { - IndexedSlicesMomentumUpdateGpu - <<As()->cuda_stream()>>>( - beta, weight_decay, feature_size, lower_bound, upper_bound, num_unique_instance, - learning_rate, indices, values, model, momentum); -} - -#define INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ - val_type_pair, key_type_pair, idx_type_pair) \ - template struct IndexedSlicesMomentumMdUpdateKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ - OF_PP_PAIR_FIRST(idx_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA - -namespace { - -__global__ void BiasCorrectionFactorKernelGpu(float beta, const int64_t* train_step, float* out) { - const auto exponent = static_cast(*train_step + 1); - const float bias_correction_factor = 1.0 - static_cast(pow(beta, exponent)); - *out = bias_correction_factor; -} - -template -__global__ void AdamUpdateGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float learning_rate_val, - float bias_correction1_val, float bias_correction2_val, - const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const G* model_diff, T* model, - T* m, T* v, T* max_v) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != 
nullptr) { scale *= *scale_by_ptr; } - if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } - - CUDA_1D_KERNEL_LOOP(i, n) { - AdamUpdateFunctor()(model_diff + i, model + i, m + i, v + i, max_v + i, scale, l1, l2, - beta1, beta2, epsilon, weight_decay, amsgrad, bias_correction1_val, - bias_correction2_val, learning_rate_val); - } -} - -template -__global__ void AdamUpdateBetaTGpu(const T beta1, const T beta2, const int64_t* skip_if, T* beta1_t, - T* beta2_t) { - if (skip_if != nullptr && *skip_if != 0) { return; } - *beta1_t *= beta1; - *beta2_t *= beta2; -} - -template -__global__ void IndexedSlicesAdamUpdateGpu( - float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float lr, int64_t feature_size, int64_t lower_bound, - int64_t upper_bound, const IDX* num_unique_instance, const float* learning_rate, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const K* indices, - const T* values, T* model, T* m, T* v, T* max_v) { - if (learning_rate != nullptr) { lr = *learning_rate; } - float bias_correction1 = 1.0; - float bias_correction2 = 1.0; - if (bias_correction1_ptr != nullptr) { bias_correction1 = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2 = *bias_correction2_ptr; } - - const int64_t n = *num_unique_instance * feature_size; - CUDA_1D_KERNEL_LOOP(i, n) { - const IDX indices_idx = i / feature_size; - const IDX inner_idx = i - indices_idx * feature_size; - const IDX instance_id = indices[indices_idx]; - if (instance_id >= lower_bound && instance_id < upper_bound) { - const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; - AdamUpdateFunctor()(values + i, model + model_idx, m + model_idx, v + model_idx, - max_v + i, static_cast(1), 0, 0, beta1, beta2, epsilon, - weight_decay, amsgrad, bias_correction1, bias_correction2, lr); - } - } -} - -template -__global__ void LambGradGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* adam_diff, T* model, T* m, T* v, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* bias_correction1_ptr, - const float* bias_correction2_ptr) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, v + i, scale, l1, l2, - beta1, beta2, epsilon, do_bias_correction, bias_correction1_val, - bias_correction2_val); - } -} - -template -__global__ void LambUpdateGpu(int64_t n, float weight_decay, float learning_rate_val, - const float* learning_rate_ptr, const int64_t* skip_if, - const T* w_norm_2, const T* g_norm_2, const T* adam_diff, T* model) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate_ptr != nullptr) { learning_rate_val = *learning_rate_ptr; } - const float lr = LambLRFunctor()(learning_rate_val, w_norm_2, g_norm_2); - CUDA_1D_KERNEL_LOOP(i, n) { LambUpdateFunctor()(lr, weight_decay, adam_diff + i, model + i); } -} - -} // namespace - -template -struct AdamUpdateKernelUtil { - static void Update(ep::Stream* 
stream, int64_t n, T scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float learning_rate_val, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const G* model_diff, T* model, T* m, T* v, - T* max_v); -}; - -template -void AdamUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, - float learning_rate_val, float bias_correction1_val, float bias_correction2_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const G* model_diff, - T* model, T* m, T* v, T* max_v) { - AdamUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, - learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, - skip_if, bias_correction1_ptr, bias_correction2_ptr, model_diff, model, m, v, max_v); -} - -template -struct AdamUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float learning_rate_val, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const float16* model_diff, T* model, T* m, - T* v, T* max_v); -}; - -template -void AdamUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, - float learning_rate_val, float bias_correction1_val, float bias_correction2_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const float16* model_diff, - T* model, T* m, T* v, T* max_v) { - AdamUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, - learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, - skip_if, bias_correction1_ptr, bias_correction2_ptr, - reinterpret_cast(model_diff), model, m, v, max_v); -} - -template struct AdamUpdateKernelUtil; -template struct AdamUpdateKernelUtil; -template struct AdamUpdateKernelUtil; - -template -__global__ void AdagradUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_decay, - float epsilon, float weight_decay, float learning_rate_val, - int64_t train_step, const float* learning_rate, - const int64_t* train_step_ptr, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* sum) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (train_step_ptr != nullptr) { - train_step = *train_step_ptr + 1; - } // train_step_ptr start from zero. 
- if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); - - CUDA_1D_KERNEL_LOOP(i, n) { - AdagradUpdateFunctor()(model_diff + i, model + i, sum + i, scale, l1, l2, epsilon, - weight_decay, learning_rate_val); - } -} - -template -struct AdagradUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, - float epsilon, float weight_decay, float learning_rate_val, int64_t train_step, - const float* learning_rate, const int64_t* train_step_ptr, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, - T* sum); -}; - -template -void AdagradUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, float epsilon, - float weight_decay, float learning_rate_val, int64_t train_step, const float* learning_rate, - const int64_t* train_step_ptr, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* model, T* sum) { - AdagradUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, lr_decay, epsilon, weight_decay, learning_rate_val, train_step, - learning_rate, train_step_ptr, scale_by_ptr, skip_if, model_diff, model, sum); -} - -template struct AdagradUpdateKernelUtil; -template struct AdagradUpdateKernelUtil; - -template -struct LambUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, float learning_rate_val, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer); -}; - -template -void LambUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, - float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, - T* norm_buffer) { - LambGradGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta1, beta2, epsilon, scale_by_ptr, skip_if, model_diff, adam_diff, model, - m, v, do_bias_correction, bias_correction1_val, bias_correction2_val, bias_correction1_ptr, - bias_correction2_ptr); - T* w_norm_2 = norm_buffer; - T* g_norm_2 = norm_buffer + 1; - Memset(stream, norm_buffer, 0, 2 * sizeof(T)); - SumSquares2 - <<As()->cuda_stream()>>>(n, model, w_norm_2, adam_diff, g_norm_2); - LambUpdateGpu<<As()->cuda_stream()>>>( - n, weight_decay, learning_rate_val, learning_rate_ptr, skip_if, w_norm_2, g_norm_2, adam_diff, - model); -} - -template -struct LambUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, float learning_rate_val, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, - const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer); -}; - -template -void 
LambUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, - float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* adam_diff, T* model, T* m, T* v, - T* norm_buffer) { - LambUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate_val, - do_bias_correction, bias_correction1_val, bias_correction2_val, learning_rate_ptr, - bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, skip_if, - reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer); -} - -template struct LambUpdateKernelUtil; -template struct LambUpdateKernelUtil; -template struct LambUpdateKernelUtil; - -template -struct IndexedSlicesAdamMdUpdateKernelUtil { - static void Update(ep::Stream* stream, float beta1, float beta2, float epsilon, - float weight_decay, bool amsgrad, bool do_bias_correction, float lr, - int64_t num_instance, int64_t feature_size, int64_t lower_bound, - int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const K* indices, const T* values, T* model, - T* m, T* v, T* max_v); -}; - -template -void IndexedSlicesAdamMdUpdateKernelUtil::Update( - ep::Stream* stream, float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, - bool do_bias_correction, float lr, int64_t num_instance, int64_t feature_size, - int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, - const float* learning_rate, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const K* indices, const T* values, T* model, T* m, T* v, - T* max_v) { - IndexedSlicesAdamUpdateGpu - <<As()->cuda_stream()>>>( - beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, lr, feature_size, - lower_bound, upper_bound, num_unique_instance, learning_rate, bias_correction1_ptr, - bias_correction2_ptr, indices, values, model, m, v, max_v); -} - -#define INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ - val_type_pair, key_type_pair, idx_type_pair) \ - template struct IndexedSlicesAdamMdUpdateKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ - OF_PP_PAIR_FIRST(idx_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); -#undef INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA - -template<> -struct BiasCorrectionFactorKernelUtil { - static void BiasCorrectionFactorCompute(ep::Stream* stream, float beta, const int64_t* train_step, - float* out); -}; - -void BiasCorrectionFactorKernelUtil::BiasCorrectionFactorCompute( - ep::Stream* stream, float beta, const int64_t* train_step, float* out) { - BiasCorrectionFactorKernelGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( - beta, train_step, out); -} - -namespace { - -template -__global__ void RmsPropUpdateGpu(int64_t n, T scale, float l1, float l2, T* mean_square, - T* mean_gradient, float epsilon, float weight_decay, - float decay_rate, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model) { - 
if (skip_if != nullptr && *skip_if != 0) { return; } - if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - RmsPropUpdateFunctor()(model_diff + i, model + i, n, scale, l1, l2, - mean_square + i, - (centered ? mean_gradient + i : nullptr), epsilon, - weight_decay, decay_rate, learning_rate_val); - } -} - -} // namespace - -template -struct RmsPropUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, - float epsilon, float weight_decay, float decay_rate, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* model, T* mean_square, T* mean_gradient); -}; - -template -void RmsPropUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, - float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, T* mean_square, - T* mean_gradient) { - if (centered) { - RmsPropUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, - learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); - } else { - RmsPropUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, - learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); - } -} - -template -struct RmsPropUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, - float epsilon, float weight_decay, float decay_rate, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float16* model_diff, T* model, T* mean_square, T* mean_gradient); -}; - -template -void RmsPropUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, - float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, T* model, - T* mean_square, T* mean_gradient) { - RmsPropUpdateKernelUtil::Update( - stream, n, scale, l1, l2, centered, epsilon, weight_decay, decay_rate, learning_rate_val, - learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, - mean_square, mean_gradient); -} - -template struct RmsPropUpdateKernelUtil; -template struct RmsPropUpdateKernelUtil; -template struct RmsPropUpdateKernelUtil; - -namespace { - -template -__global__ void LarsScaleModelDiffGpu(int64_t n, T scale, float l1, float l2, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, - T* model_diff_tmp) { - if (skip_if != nullptr && *skip_if != 0) { return; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - model_diff_tmp[i] = - CastScaleRegularizeGradientFunctor()(model_diff[i], model[i], scale, l1, l2); - } -} - -template -__global__ void LarsGetLocalLearningRateGpu(const float* learning_rate, T weight_decay, T epsilon, - T lars_coefficient, const int64_t* skip_if, - T* data_tmp) { - if (skip_if != nullptr && *skip_if != 0) { return; } - T* model_norm = &data_tmp[0]; - T* model_diff_norm = &data_tmp[1]; - T* local_learning_rate = &data_tmp[2]; - *model_norm = std::sqrt(*model_norm); - 
*model_diff_norm = std::sqrt(*model_diff_norm); - T lars = static_cast(1); - if (*model_norm > 0 && *model_diff_norm > 0) { - lars = lars_coefficient * (*model_norm) - / (epsilon + (*model_diff_norm) + weight_decay * (*model_norm)); - } - *local_learning_rate = *learning_rate * lars; -} - -template -__global__ void LarsUpdateGpu(int64_t n, float momentum_beta, T* momentum, float weight_decay, - const int64_t* skip_if, T* local_learning_rate, T* model_diff_tmp, - T* model) { - if (skip_if != nullptr && *skip_if != 0) { return; } - CUDA_1D_KERNEL_LOOP(i, n) { - LarsUpdateFunctor()(model_diff_tmp + i, model + i, momentum_beta, momentum + i, weight_decay, - *local_learning_rate); - } -} -} // namespace - -template -struct LarsUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, - float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* model, T* momentum, T* data_tmp, T* model_diff_tmp); -}; - -template -void LarsUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, - float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* momentum, T* data_tmp, - T* model_diff_tmp) { - LarsScaleModelDiffGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, scale_by_ptr, skip_if, model_diff, model, model_diff_tmp); - T* model_norm = data_tmp; - T* model_diff_norm = data_tmp + 1; - T* local_learning_rate = data_tmp + 2; - Memset(stream, data_tmp, 0, 2 * sizeof(T)); - SumSquares2<<As()->cuda_stream()>>>(n, model, model_norm, - model_diff_tmp, model_diff_norm); - LarsGetLocalLearningRateGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( - learning_rate, weight_decay, epsilon, lars_coefficient, skip_if, data_tmp); - LarsUpdateGpu<<As()->cuda_stream()>>>( - n, momentum_beta, momentum, weight_decay, skip_if, local_learning_rate, model_diff_tmp, - model); -} - -template -struct LarsUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, - float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float16* model_diff, T* model, T* momentum, T* data_tmp, - T* model_diff_tmp); -}; - -template -void LarsUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, - float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model, T* momentum, T* data_tmp, - T* model_diff_tmp) { - LarsUpdateKernelUtil::Update( - stream, n, scale, l1, l2, momentum_beta, epsilon, lars_coefficient, weight_decay, - learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, - momentum, data_tmp, model_diff_tmp); -} - -template struct LarsUpdateKernelUtil; -template struct LarsUpdateKernelUtil; -template struct LarsUpdateKernelUtil; - -template -__global__ void FtrlUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, - float lambda2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* model, T* accumulate, T* z) { - if (skip_if != nullptr && *skip_if != 0) { return; } 
- if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - CUDA_1D_KERNEL_LOOP(i, n) { - FtrlUpdateFunctor()(model_diff + i, model + i, accumulate + i, z + i, scale, l1, l2, - lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val); - } -} - -template -struct FtrlUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, - float lambda1, float lambda2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* model, T* accumulate, T* z); -}; - -template -void FtrlUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, - float lambda2, float beta, float weight_decay, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* model, T* accumulate, T* z) { - FtrlUpdateGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, - learning_rate, scale_by_ptr, skip_if, model_diff, model, accumulate, z); -} - -template -struct FtrlUpdateKernelUtil { - static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, - float lambda1, float lambda2, float beta, float weight_decay, - float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* model, T* accumulate, - T* z); -}; - -template -void FtrlUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, - float lambda2, float beta, float weight_decay, float learning_rate_val, - const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, - const float16* model_diff, T* model, T* accumulate, T* z) { - FtrlUpdateKernelUtil::Update( - stream, n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, - learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, - accumulate, z); -} - -template struct FtrlUpdateKernelUtil; -template struct FtrlUpdateKernelUtil; -template struct FtrlUpdateKernelUtil; +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/model_update_kernel_util.h" +#include +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void SGDUpdateGpu(int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* model) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + SGDUpdateFunctor()(model_diff + i, model + i, scale, l1, l2, weight_decay, + learning_rate_val); + } +} + +template +__global__ void IndexedSlicesSGDUpdateGpu(float weight_decay, const IDX feature_size, + const int64_t lower_bound, const int64_t upper_bound, + const IDX* num_unique_instance, + const float* learning_rate, const K* indices, + const T* values, T* model) { + const int64_t n = *num_unique_instance * feature_size; + const T lr = *learning_rate; + CUDA_1D_KERNEL_LOOP_T(IDX, i, n) { + const IDX indices_idx = i / feature_size; + const IDX inner_idx = i - indices_idx * feature_size; + const IDX instance_id = indices[indices_idx]; + if (instance_id >= lower_bound && instance_id < upper_bound) { + const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; + SGDUpdateFunctor()(values + i, model + model_idx, static_cast(1), 0.0, 0.0, + weight_decay, lr); + } + } +} + +template +__global__ void SumSquares2(int64_t n, const T* src0, T* dst0, const T* src1, T* dst1) { + T t_sum0 = 0; + T t_sum1 = 0; + CUDA_1D_KERNEL_LOOP(i, n) { + t_sum0 += src0[i] * src0[i]; + t_sum1 += src1[i] * src1[i]; + } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage0; + __shared__ typename BlockReduce::TempStorage temp_storage1; + T b_sum0 = BlockReduce(temp_storage0).Sum(t_sum0); + T b_sum1 = BlockReduce(temp_storage1).Sum(t_sum1); + if (threadIdx.x == 0) { + cuda::atomic::Add(dst0, b_sum0); + cuda::atomic::Add(dst1, b_sum1); + } +} + +} // namespace + +template +struct SGDUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model); +}; + +template +void SGDUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model) { + SGDUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, + model_diff, model); +} + +template +struct SGDUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model); +}; + +template +void SGDUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model) { + SGDUpdateKernelUtil::Update( + stream, n, scale, l1, 
l2, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, + skip_if, reinterpret_cast(model_diff), model); +} + +template struct SGDUpdateKernelUtil; +template struct SGDUpdateKernelUtil; +template struct SGDUpdateKernelUtil; + +template +struct IndexedSlicesSGDUpdateKernelUtil { + static void Update(ep::Stream* stream, float weight_decay, int64_t num_indices, + int64_t feature_size, int64_t lower_bound, int64_t upper_bound, + const IDX* num_unique_instance, const float* learning_rate, const K* indices, + const T* values, T* model); +}; + +template +void IndexedSlicesSGDUpdateKernelUtil::Update( + ep::Stream* stream, float weight_decay, int64_t num_indices, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const K* indices, const T* values, T* model) { + IndexedSlicesSGDUpdateGpu + <<As()->cuda_stream()>>>(weight_decay, feature_size, lower_bound, + upper_bound, num_unique_instance, + learning_rate, indices, values, model); +} + +#define INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA(val_type_pair, key_type_pair, \ + idx_type_pair) \ + template struct IndexedSlicesSGDUpdateKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ + OF_PP_PAIR_FIRST(idx_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INITIATE_INDEXED_SLICES_SGD_UPDATE_KERNEL_UTIL_CUDA + +namespace { + +template +__global__ void MomentumUpdateGpu(int64_t n, T scale, float l1, float l2, float beta, + float weight_decay, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, + T* momentum) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + MomentumUpdateFunctor()(model_diff + i, model + i, momentum + i, scale, l1, l2, beta, + weight_decay, learning_rate_val); + } +} + +template +__global__ void IndexedSlicesMomentumUpdateGpu(T beta, float weight_decay, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, + const IDX* num_unique_instance, + const float* learning_rate, const K* indices, + const T* values, T* model, T* momentum) { + const int64_t n = *num_unique_instance * feature_size; + const T lr = *learning_rate; + CUDA_1D_KERNEL_LOOP(i, n) { + const IDX indices_idx = i / feature_size; + const IDX inner_idx = i - indices_idx * feature_size; + const IDX instance_id = indices[indices_idx]; + if (instance_id >= lower_bound && instance_id < upper_bound) { + const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; + MomentumUpdateFunctor()(values + i, model + model_idx, momentum + model_idx, + static_cast(1), 0.0, 0.0, beta, weight_decay, lr); + } + } +} + +} // namespace + +template +struct MomentumUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, + float weight_decay, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, + T* momentum); +}; + +template +void MomentumUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* 
scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* momentum) { + MomentumUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, skip_if, + model_diff, model, momentum); +} + +template +struct MomentumUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, + float weight_decay, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, + T* model, T* momentum); +}; + +template +void MomentumUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model, T* momentum) { + MomentumUpdateKernelUtil::Update( + stream, n, scale, l1, l2, beta, weight_decay, learning_rate_val, learning_rate, scale_by_ptr, + skip_if, reinterpret_cast(model_diff), model, momentum); +} + +template struct MomentumUpdateKernelUtil; +template struct MomentumUpdateKernelUtil; +template struct MomentumUpdateKernelUtil; + +template +struct IndexedSlicesMomentumMdUpdateKernelUtil { + static void Update(ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, + int64_t feature_size, int64_t lower_bound, int64_t upper_bound, + const IDX* num_unique_instance, const float* learning_rate, const K* indices, + const T* values, T* model, T* momentum); +}; + +template +void IndexedSlicesMomentumMdUpdateKernelUtil::Update( + ep::Stream* stream, T beta, float weight_decay, int64_t num_instance, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const K* indices, const T* values, T* model, T* momentum) { + IndexedSlicesMomentumUpdateGpu + <<As()->cuda_stream()>>>( + beta, weight_decay, feature_size, lower_bound, upper_bound, num_unique_instance, + learning_rate, indices, values, model, momentum); +} + +#define INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ + val_type_pair, key_type_pair, idx_type_pair) \ + template struct IndexedSlicesMomentumMdUpdateKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ + OF_PP_PAIR_FIRST(idx_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INSTANTIATE_INDEXED_SLICES_MOMENTUM_MODEL_UPDATE_KERNEL_UTIL_CUDA + +namespace { + +__global__ void BiasCorrectionFactorKernelGpu(float beta, const int64_t* train_step, float* out) { + const auto exponent = static_cast(*train_step + 1); + const float bias_correction_factor = 1.0 - static_cast(pow(beta, exponent)); + *out = bias_correction_factor; +} + +template +__global__ void AdamUpdateGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float learning_rate_val, + float bias_correction1_val, float bias_correction2_val, + const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const G* model_diff, T* model, + T* m, T* v, T* max_v) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != 
nullptr) { scale *= *scale_by_ptr; } + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } + + CUDA_1D_KERNEL_LOOP(i, n) { + AdamUpdateFunctor()(model_diff + i, model + i, m + i, v + i, max_v + i, scale, l1, l2, + beta1, beta2, epsilon, weight_decay, amsgrad, bias_correction1_val, + bias_correction2_val, learning_rate_val); + } +} + +template +__global__ void AdamUpdateBetaTGpu(const T beta1, const T beta2, const int64_t* skip_if, T* beta1_t, + T* beta2_t) { + if (skip_if != nullptr && *skip_if != 0) { return; } + *beta1_t *= beta1; + *beta2_t *= beta2; +} + +template +__global__ void IndexedSlicesAdamUpdateGpu( + float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float lr, int64_t feature_size, int64_t lower_bound, + int64_t upper_bound, const IDX* num_unique_instance, const float* learning_rate, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const K* indices, + const T* values, T* model, T* m, T* v, T* max_v) { + if (learning_rate != nullptr) { lr = *learning_rate; } + float bias_correction1 = 1.0; + float bias_correction2 = 1.0; + if (bias_correction1_ptr != nullptr) { bias_correction1 = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2 = *bias_correction2_ptr; } + + const int64_t n = *num_unique_instance * feature_size; + CUDA_1D_KERNEL_LOOP(i, n) { + const IDX indices_idx = i / feature_size; + const IDX inner_idx = i - indices_idx * feature_size; + const IDX instance_id = indices[indices_idx]; + if (instance_id >= lower_bound && instance_id < upper_bound) { + const IDX model_idx = (instance_id - lower_bound) * feature_size + inner_idx; + AdamUpdateFunctor()(values + i, model + model_idx, m + model_idx, v + model_idx, + max_v + i, static_cast(1), 0, 0, beta1, beta2, epsilon, + weight_decay, amsgrad, bias_correction1, bias_correction2, lr); + } + } +} + +template +__global__ void LambGradGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* adam_diff, T* model, T* m, T* v, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* bias_correction1_ptr, + const float* bias_correction2_ptr) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, v + i, scale, l1, l2, + beta1, beta2, epsilon, do_bias_correction, bias_correction1_val, + bias_correction2_val); + } +} + +template +__global__ void LambUpdateGpu(int64_t n, float weight_decay, float learning_rate_val, + const float* learning_rate_ptr, const int64_t* skip_if, + const T* w_norm_2, const T* g_norm_2, const T* adam_diff, T* model) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate_ptr != nullptr) { learning_rate_val = *learning_rate_ptr; } + const float lr = LambLRFunctor()(learning_rate_val, w_norm_2, g_norm_2); + CUDA_1D_KERNEL_LOOP(i, n) { LambUpdateFunctor()(lr, weight_decay, adam_diff + i, model + i); } +} + +} // namespace + +template +struct AdamUpdateKernelUtil { + static void Update(ep::Stream* 
stream, int64_t n, T scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float learning_rate_val, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const G* model_diff, T* model, T* m, T* v, + T* max_v); +}; + +template +void AdamUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, + float learning_rate_val, float bias_correction1_val, float bias_correction2_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const G* model_diff, + T* model, T* m, T* v, T* max_v) { + AdamUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, + learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, + skip_if, bias_correction1_ptr, bias_correction2_ptr, model_diff, model, m, v, max_v); +} + +template +struct AdamUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float learning_rate_val, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const float16* model_diff, T* model, T* m, + T* v, T* max_v); +}; + +template +void AdamUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, bool amsgrad, bool do_bias_correction, + float learning_rate_val, float bias_correction1_val, float bias_correction2_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const float16* model_diff, + T* model, T* m, T* v, T* max_v) { + AdamUpdateKernelUtil::Update( + stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, + learning_rate_val, bias_correction1_val, bias_correction2_val, learning_rate, scale_by_ptr, + skip_if, bias_correction1_ptr, bias_correction2_ptr, + reinterpret_cast(model_diff), model, m, v, max_v); +} + +template struct AdamUpdateKernelUtil; +template struct AdamUpdateKernelUtil; +template struct AdamUpdateKernelUtil; + +template +__global__ void AdagradUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_decay, + float epsilon, float weight_decay, float learning_rate_val, + int64_t train_step, const float* learning_rate, + const int64_t* train_step_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* sum) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (train_step_ptr != nullptr) { + train_step = *train_step_ptr + 1; + } // train_step_ptr start from zero. 
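+  // (Editor's note, not part of the original patch.) The statements below apply the usual
+  // Adagrad schedule, assuming AdagradUpdateFunctor (declared in model_update_kernel_util.h)
+  // matches its CUDA counterpart:
+  //   lr_t  = learning_rate / (1 + (train_step - 1) * lr_decay)
+  //   sum  += g * g
+  //   model -= lr_t * g / (sqrt(sum) + epsilon)   (plus weight decay on model)
+  // where g is the scaled, L1/L2-regularized gradient. Sketch for orientation only.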
+ if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); + + CUDA_1D_KERNEL_LOOP(i, n) { + AdagradUpdateFunctor()(model_diff + i, model + i, sum + i, scale, l1, l2, epsilon, + weight_decay, learning_rate_val); + } +} + +template +struct AdagradUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, + float epsilon, float weight_decay, float learning_rate_val, int64_t train_step, + const float* learning_rate, const int64_t* train_step_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, + T* sum); +}; + +template +void AdagradUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_decay, float epsilon, + float weight_decay, float learning_rate_val, int64_t train_step, const float* learning_rate, + const int64_t* train_step_ptr, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* model, T* sum) { + AdagradUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, lr_decay, epsilon, weight_decay, learning_rate_val, train_step, + learning_rate, train_step_ptr, scale_by_ptr, skip_if, model_diff, model, sum); +} + +template struct AdagradUpdateKernelUtil; +template struct AdagradUpdateKernelUtil; + +template +struct LambUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); +}; + +template +void LambUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, + float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, + T* norm_buffer) { + LambGradGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta1, beta2, epsilon, scale_by_ptr, skip_if, model_diff, adam_diff, model, + m, v, do_bias_correction, bias_correction1_val, bias_correction2_val, bias_correction1_ptr, + bias_correction2_ptr); + T* w_norm_2 = norm_buffer; + T* g_norm_2 = norm_buffer + 1; + Memset(stream, norm_buffer, 0, 2 * sizeof(T)); + SumSquares2 + <<As()->cuda_stream()>>>(n, model, w_norm_2, adam_diff, g_norm_2); + LambUpdateGpu<<As()->cuda_stream()>>>( + n, weight_decay, learning_rate_val, learning_rate_ptr, skip_if, w_norm_2, g_norm_2, adam_diff, + model); +} + +template +struct LambUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); +}; + +template +void 
LambUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, + float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* adam_diff, T* model, T* m, T* v, + T* norm_buffer) { + LambUpdateKernelUtil::Update( + stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate_val, + do_bias_correction, bias_correction1_val, bias_correction2_val, learning_rate_ptr, + bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, skip_if, + reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer); +} + +template struct LambUpdateKernelUtil; +template struct LambUpdateKernelUtil; +template struct LambUpdateKernelUtil; + +template +struct IndexedSlicesAdamMdUpdateKernelUtil { + static void Update(ep::Stream* stream, float beta1, float beta2, float epsilon, + float weight_decay, bool amsgrad, bool do_bias_correction, float lr, + int64_t num_instance, int64_t feature_size, int64_t lower_bound, + int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const K* indices, const T* values, T* model, + T* m, T* v, T* max_v); +}; + +template +void IndexedSlicesAdamMdUpdateKernelUtil::Update( + ep::Stream* stream, float beta1, float beta2, float epsilon, float weight_decay, bool amsgrad, + bool do_bias_correction, float lr, int64_t num_instance, int64_t feature_size, + int64_t lower_bound, int64_t upper_bound, const IDX* num_unique_instance, + const float* learning_rate, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const K* indices, const T* values, T* model, T* m, T* v, + T* max_v) { + IndexedSlicesAdamUpdateGpu + <<As()->cuda_stream()>>>( + beta1, beta2, epsilon, weight_decay, amsgrad, do_bias_correction, lr, feature_size, + lower_bound, upper_bound, num_unique_instance, learning_rate, bias_correction1_ptr, + bias_correction2_ptr, indices, values, model, m, v, max_v); +} + +#define INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA( \ + val_type_pair, key_type_pair, idx_type_pair) \ + template struct IndexedSlicesAdamMdUpdateKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(val_type_pair), OF_PP_PAIR_FIRST(key_type_pair), \ + OF_PP_PAIR_FIRST(idx_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ); +#undef INSTANTIATE_INDEXED_SLICES_ADAM_MODEL_UPDATE_KERNEL_UTIL_CUDA + +template<> +struct BiasCorrectionFactorKernelUtil { + static void BiasCorrectionFactorCompute(ep::Stream* stream, float beta, const int64_t* train_step, + float* out); +}; + +void BiasCorrectionFactorKernelUtil::BiasCorrectionFactorCompute( + ep::Stream* stream, float beta, const int64_t* train_step, float* out) { + BiasCorrectionFactorKernelGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( + beta, train_step, out); +} + +namespace { + +template +__global__ void RmsPropUpdateGpu(int64_t n, T scale, float l1, float l2, T* mean_square, + T* mean_gradient, float epsilon, float weight_decay, + float decay_rate, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model) { + 
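+  // (Editor's note, not part of the original patch.) Sketch of the step this kernel performs,
+  // assuming RmsPropUpdateFunctor follows the standard, optionally centered RMSProp rule:
+  //   mean_square   = (1 - decay_rate) * g^2 + decay_rate * mean_square
+  //   mean_gradient = (1 - decay_rate) * g   + decay_rate * mean_gradient      (centered only)
+  //   denom = centered ? mean_square - mean_gradient^2 : mean_square
+  //   model -= learning_rate * g / sqrt(denom + epsilon)
+  // with g the scaled, L1/L2-regularized gradient; the exact functor lives in
+  // model_update_kernel_util.h.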
if (skip_if != nullptr && *skip_if != 0) { return; } + if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + RmsPropUpdateFunctor()(model_diff + i, model + i, n, scale, l1, l2, + mean_square + i, + (centered ? mean_gradient + i : nullptr), epsilon, + weight_decay, decay_rate, learning_rate_val); + } +} + +} // namespace + +template +struct RmsPropUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, + float epsilon, float weight_decay, float decay_rate, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* model, T* mean_square, T* mean_gradient); +}; + +template +void RmsPropUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, + float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, T* model, T* mean_square, + T* mean_gradient) { + if (centered) { + RmsPropUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, + learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); + } else { + RmsPropUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, mean_square, mean_gradient, epsilon, weight_decay, decay_rate, + learning_rate_val, learning_rate, scale_by_ptr, skip_if, model_diff, model); + } +} + +template +struct RmsPropUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, + float epsilon, float weight_decay, float decay_rate, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float16* model_diff, T* model, T* mean_square, T* mean_gradient); +}; + +template +void RmsPropUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, bool centered, float epsilon, + float weight_decay, float decay_rate, float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, T* model, + T* mean_square, T* mean_gradient) { + RmsPropUpdateKernelUtil::Update( + stream, n, scale, l1, l2, centered, epsilon, weight_decay, decay_rate, learning_rate_val, + learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, + mean_square, mean_gradient); +} + +template struct RmsPropUpdateKernelUtil; +template struct RmsPropUpdateKernelUtil; +template struct RmsPropUpdateKernelUtil; + +namespace { + +template +__global__ void LarsScaleModelDiffGpu(int64_t n, T scale, float l1, float l2, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, + T* model_diff_tmp) { + if (skip_if != nullptr && *skip_if != 0) { return; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + model_diff_tmp[i] = + CastScaleRegularizeGradientFunctor()(model_diff[i], model[i], scale, l1, l2); + } +} + +template +__global__ void LarsGetLocalLearningRateGpu(const float* learning_rate, T weight_decay, T epsilon, + T lars_coefficient, const int64_t* skip_if, + T* data_tmp) { + if (skip_if != nullptr && *skip_if != 0) { return; } + T* model_norm = &data_tmp[0]; + T* model_diff_norm = &data_tmp[1]; + T* local_learning_rate = &data_tmp[2]; + *model_norm = std::sqrt(*model_norm); + 
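+  // data_tmp holds [sum of squares of model, sum of squares of scaled model_diff, local lr];
+  // the square roots just above and below turn the first two into L2 norms, and the statements
+  // that follow form the LARS trust ratio
+  //   lars = lars_coefficient * ||w|| / (epsilon + ||g|| + weight_decay * ||w||)
+  // and scale *learning_rate by it.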
*model_diff_norm = std::sqrt(*model_diff_norm); + T lars = static_cast(1); + if (*model_norm > 0 && *model_diff_norm > 0) { + lars = lars_coefficient * (*model_norm) + / (epsilon + (*model_diff_norm) + weight_decay * (*model_norm)); + } + *local_learning_rate = *learning_rate * lars; +} + +template +__global__ void LarsUpdateGpu(int64_t n, float momentum_beta, T* momentum, float weight_decay, + const int64_t* skip_if, T* local_learning_rate, T* model_diff_tmp, + T* model) { + if (skip_if != nullptr && *skip_if != 0) { return; } + CUDA_1D_KERNEL_LOOP(i, n) { + LarsUpdateFunctor()(model_diff_tmp + i, model + i, momentum_beta, momentum + i, weight_decay, + *local_learning_rate); + } +} +} // namespace + +template +struct LarsUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, + float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* model, T* momentum, T* data_tmp, T* model_diff_tmp); +}; + +template +void LarsUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, + float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* momentum, T* data_tmp, + T* model_diff_tmp) { + LarsScaleModelDiffGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, scale_by_ptr, skip_if, model_diff, model, model_diff_tmp); + T* model_norm = data_tmp; + T* model_diff_norm = data_tmp + 1; + T* local_learning_rate = data_tmp + 2; + Memset(stream, data_tmp, 0, 2 * sizeof(T)); + SumSquares2<<As()->cuda_stream()>>>(n, model, model_norm, + model_diff_tmp, model_diff_norm); + LarsGetLocalLearningRateGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( + learning_rate, weight_decay, epsilon, lars_coefficient, skip_if, data_tmp); + LarsUpdateGpu<<As()->cuda_stream()>>>( + n, momentum_beta, momentum, weight_decay, skip_if, local_learning_rate, model_diff_tmp, + model); +} + +template +struct LarsUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, + float momentum_beta, float epsilon, float lars_coefficient, float weight_decay, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float16* model_diff, T* model, T* momentum, T* data_tmp, + T* model_diff_tmp); +}; + +template +void LarsUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float momentum_beta, float epsilon, + float lars_coefficient, float weight_decay, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model, T* momentum, T* data_tmp, + T* model_diff_tmp) { + LarsUpdateKernelUtil::Update( + stream, n, scale, l1, l2, momentum_beta, epsilon, lars_coefficient, weight_decay, + learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, + momentum, data_tmp, model_diff_tmp); +} + +template struct LarsUpdateKernelUtil; +template struct LarsUpdateKernelUtil; +template struct LarsUpdateKernelUtil; + +template +__global__ void FtrlUpdateGpu(int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, + float lambda2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* model, T* accumulate, T* z) { + if (skip_if != nullptr && *skip_if != 0) { return; } 
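+  // NOTE: the per-element math is delegated to FtrlUpdateFunctor (model_update_kernel_util.h),
+  // assumed to implement the usual FTRL-Proximal rule: accumulate the squared gradient, update z
+  // with the lr_power-corrected step, and recover the model by soft-thresholding z with
+  // lambda1/lambda2/beta; the exact formula is defined in the functor, not here.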
+ if (learning_rate != nullptr) { learning_rate_val = *learning_rate; } + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + CUDA_1D_KERNEL_LOOP(i, n) { + FtrlUpdateFunctor()(model_diff + i, model + i, accumulate + i, z + i, scale, l1, l2, + lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val); + } +} + +template +struct FtrlUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, + float lambda1, float lambda2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* model, T* accumulate, T* z); +}; + +template +void FtrlUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, + float lambda2, float beta, float weight_decay, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* model, T* accumulate, T* z) { + FtrlUpdateGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, + learning_rate, scale_by_ptr, skip_if, model_diff, model, accumulate, z); +} + +template +struct FtrlUpdateKernelUtil { + static void Update(ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, + float lambda1, float lambda2, float beta, float weight_decay, + float learning_rate_val, const float* learning_rate, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* model, T* accumulate, + T* z); +}; + +template +void FtrlUpdateKernelUtil::Update( + ep::Stream* stream, int64_t n, T scale, float l1, float l2, float lr_power, float lambda1, + float lambda2, float beta, float weight_decay, float learning_rate_val, + const float* learning_rate, const T* scale_by_ptr, const int64_t* skip_if, + const float16* model_diff, T* model, T* accumulate, T* z) { + FtrlUpdateKernelUtil::Update( + stream, n, scale, l1, l2, lr_power, lambda1, lambda2, beta, weight_decay, learning_rate_val, + learning_rate, scale_by_ptr, skip_if, reinterpret_cast(model_diff), model, + accumulate, z); +} + +template struct FtrlUpdateKernelUtil; +template struct FtrlUpdateKernelUtil; +template struct FtrlUpdateKernelUtil; } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp b/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp index 4feb42b..6d65e1f 100644 --- a/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp +++ b/oneflow/user/kernels/moving_average_min_max_observer_kernel.hip.cpp @@ -1,317 +1,317 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#include - -namespace oneflow { - -namespace { - -// NOTE(Liang Depeng): refer to -// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda -template -__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, - T* min_ptr) { - extern __shared__ unsigned char shared_max_min_memory[]; - T* shared_max = reinterpret_cast(shared_max_min_memory); - T* shared_min = shared_max + blockDim.x; - - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - shared_max[tid] = -FLT_MAX; - shared_min[tid] = -FLT_MAX; - - while (gid < elements) { - shared_max[tid] = max(shared_max[tid], input_ptr[gid]); - shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); - gid += gridDim.x * blockDim.x; - } - __syncthreads(); - gid = (blockDim.x * blockIdx.x) + tid; - for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { - if (tid < s && gid < elements) { - shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); - shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); - } - __syncthreads(); - } - - if (tid == 0) { - cuda::atomic::Max(max_ptr, shared_max[0]); - cuda::atomic::Max(min_ptr, shared_min[0]); - } -} - -template -__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - max_ptr[gid] = -FLT_MAX; - min_ptr[gid] = -FLT_MAX; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointSymmetric(const int64_t elements, const double quantization_bit, - const float momentum, const T* max_ptr, const T* min_ptr, - T* moving_max_ptr, T* moving_min_ptr, T* scale, - T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T activation_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - - if (moving_max_ptr[gid] == 0) - moving_max_ptr[gid] = activation_max; - else - moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); - - // NOTE(Liang Depeng): symmetric quantization only use moving_max to calculate the scale - moving_min_ptr[gid] = moving_max_ptr[gid]; - - scale[gid] = moving_max_ptr[gid] / denominator; - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalFreezeScaleZeroPointSymmetric(const int64_t elements, - const double quantization_bit, - const float momentum, const T* moving_max_ptr, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = moving_max_ptr[gid] / denominator; - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointAffine(const int64_t elements, const double quantization_bit, - const float momentum, const T* max_ptr, const T* min_ptr, - T* moving_max_ptr, T* moving_min_ptr, T* scale, - T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit)) - 1; - - if (moving_max_ptr[gid] == 0) - moving_max_ptr[gid] = 
max_ptr[gid]; - else - moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + max_ptr[gid] * (1 - momentum); - - if (moving_min_ptr[gid] == 0) - moving_min_ptr[gid] = -min_ptr[gid]; - else - moving_min_ptr[gid] = moving_min_ptr[gid] * momentum + -min_ptr[gid] * (1 - momentum); - - T min = moving_min_ptr[gid]; - T s = (moving_max_ptr[gid] - min) / denominator; - - scale[gid] = s; - zero_point[gid] = -round(min / s); - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalFreezeScaleZeroPointAffine(const int64_t elements, const double quantization_bit, - const float momentum, const T* moving_max_ptr, - const T* moving_min_ptr, T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit)) - 1; - - T min = moving_min_ptr[gid]; - T s = (moving_max_ptr[gid] - min) / denominator; - - scale[gid] = s; - zero_point[gid] = -round(min / s); - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalScaleZeroPointCambricon(const int64_t elements, const double quantization_bit, - const float momentum, const T* max_ptr, const T* min_ptr, - T* moving_max_ptr, T* moving_min_ptr, T* scale, - T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T activation_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); - - if (moving_max_ptr[gid] == 0) - moving_max_ptr[gid] = activation_max; - else - moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); - - // NOTE(Liang Depeng): cambricon quantization only use moving_max to calculate the scale - moving_min_ptr[gid] = moving_max_ptr[gid]; - - scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -template -__global__ void CalFreezeScaleZeroPointCambricon(const int64_t elements, - const double quantization_bit, - const float momentum, const T* moving_max_ptr, - T* scale, T* zero_point) { - int64_t tid = threadIdx.x; - int64_t gid = (blockDim.x * blockIdx.x) + tid; - - while (gid < elements) { - T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; - scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); - zero_point[gid] = 0; - gid += gridDim.x * blockDim.x; - } -} - -ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, - size_t shared_mem_size) { - ep::CudaLaunchConfig config; - stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); - config.shared_mem_size = shared_mem_size; - return config; -} - -} // namespace - -#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) 
\ - (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); - -template -class GpuMovingAverageMinMaxObserverKernel final : public user_op::OpKernel { - public: - GpuMovingAverageMinMaxObserverKernel() = default; - ~GpuMovingAverageMinMaxObserverKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* current_train_step = - ctx->Tensor4ArgNameAndIndex("current_train_step", 0); - user_op::Tensor* moving_max = ctx->Tensor4ArgNameAndIndex("moving_max", 0); - user_op::Tensor* moving_min = ctx->Tensor4ArgNameAndIndex("moving_min", 0); - user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const bool is_training = ctx->Attr("training"); - const int64_t stop_update_after_iters = ctx->Attr("stop_update_after_iters"); - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const float momentum = ctx->Attr("momentum"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - int64_t elements = in->shape_view().elem_cnt(); - T* max_ptr = tmp_buffer->mut_dptr(); - T* min_ptr = max_ptr + 1; - - int64_t* host_current_train_step_ptr = new int64_t[current_train_step->shape_view().elem_cnt()]; - OF_CUDA_CHECK(hipMemcpy(host_current_train_step_ptr, current_train_step->dptr(), - current_train_step->shape_view().elem_cnt() * sizeof(int64_t), - hipMemcpyDefault)); - auto* cuda_stream = ctx->stream()->As(); - if (*host_current_train_step_ptr <= stop_update_after_iters && is_training) { - LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, 1, 0, 1, max_ptr, min_ptr); - LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, - kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, - min_ptr); - } - bool moving = (*host_current_train_step_ptr <= stop_update_after_iters) && is_training; - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - if (moving) { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, max_ptr, min_ptr, - moving_max->mut_dptr(), moving_min->mut_dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } else { - LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, moving_max->dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } - } else { // quantization_scheme == "affine" - if (moving) { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, max_ptr, min_ptr, - moving_max->mut_dptr(), moving_min->mut_dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } else { - LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointAffine), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, moving_max->dptr(), - moving_min->dptr(), scale->mut_dptr(), - zero_point->mut_dptr()); - } - } - } else if (quantization_formula == "cambricon") { - if (moving) { - LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, max_ptr, min_ptr, - moving_max->mut_dptr(), moving_min->mut_dptr(), - scale->mut_dptr(), 
zero_point->mut_dptr()); - } else { - LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointCambricon), cuda_stream, 1, 0, 1, - static_cast(quantization_bit), momentum, moving_max->dptr(), - scale->mut_dptr(), zero_point->mut_dptr()); - } - } else { - UNIMPLEMENTED(); - } - - delete[] host_current_train_step_ptr; - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(dtype) \ - REGISTER_USER_KERNEL("moving_average_min_max_observer") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 2 * sizeof(dtype); }) - -REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(float); -REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#include + +namespace oneflow { + +namespace { + +// NOTE(Liang Depeng): refer to +// https://stackoverflow.com/questions/17371275/implementing-max-reduce-in-cuda +template +__global__ void ReduceMaxMinPerLayer(const T* input_ptr, const int64_t elements, T* max_ptr, + T* min_ptr) { + extern __shared__ unsigned char shared_max_min_memory[]; + T* shared_max = reinterpret_cast(shared_max_min_memory); + T* shared_min = shared_max + blockDim.x; + + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + shared_max[tid] = -FLT_MAX; + shared_min[tid] = -FLT_MAX; + + while (gid < elements) { + shared_max[tid] = max(shared_max[tid], input_ptr[gid]); + shared_min[tid] = max(shared_min[tid], -input_ptr[gid]); + gid += gridDim.x * blockDim.x; + } + __syncthreads(); + gid = (blockDim.x * blockIdx.x) + tid; + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s && gid < elements) { + shared_max[tid] = max(shared_max[tid], shared_max[tid + s]); + shared_min[tid] = max(shared_min[tid], shared_min[tid + s]); + } + __syncthreads(); + } + + if (tid == 0) { + cuda::atomic::Max(max_ptr, shared_max[0]); + cuda::atomic::Max(min_ptr, shared_min[0]); + } +} + +template +__global__ void InitMaxMin(const int64_t elements, T* max_ptr, T* min_ptr) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + max_ptr[gid] = -FLT_MAX; + min_ptr[gid] = -FLT_MAX; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointSymmetric(const int64_t elements, const double quantization_bit, + const float momentum, const T* max_ptr, const T* min_ptr, + T* moving_max_ptr, T* moving_min_ptr, T* scale, + T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T activation_max = 
max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + + if (moving_max_ptr[gid] == 0) + moving_max_ptr[gid] = activation_max; + else + moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); + + // NOTE(Liang Depeng): symmetric quantization only use moving_max to calculate the scale + moving_min_ptr[gid] = moving_max_ptr[gid]; + + scale[gid] = moving_max_ptr[gid] / denominator; + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalFreezeScaleZeroPointSymmetric(const int64_t elements, + const double quantization_bit, + const float momentum, const T* moving_max_ptr, + T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = moving_max_ptr[gid] / denominator; + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointAffine(const int64_t elements, const double quantization_bit, + const float momentum, const T* max_ptr, const T* min_ptr, + T* moving_max_ptr, T* moving_min_ptr, T* scale, + T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit)) - 1; + + if (moving_max_ptr[gid] == 0) + moving_max_ptr[gid] = max_ptr[gid]; + else + moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + max_ptr[gid] * (1 - momentum); + + if (moving_min_ptr[gid] == 0) + moving_min_ptr[gid] = -min_ptr[gid]; + else + moving_min_ptr[gid] = moving_min_ptr[gid] * momentum + -min_ptr[gid] * (1 - momentum); + + T min = moving_min_ptr[gid]; + T s = (moving_max_ptr[gid] - min) / denominator; + + scale[gid] = s; + zero_point[gid] = -round(min / s); + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalFreezeScaleZeroPointAffine(const int64_t elements, const double quantization_bit, + const float momentum, const T* moving_max_ptr, + const T* moving_min_ptr, T* scale, T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit)) - 1; + + T min = moving_min_ptr[gid]; + T s = (moving_max_ptr[gid] - min) / denominator; + + scale[gid] = s; + zero_point[gid] = -round(min / s); + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalScaleZeroPointCambricon(const int64_t elements, const double quantization_bit, + const float momentum, const T* max_ptr, const T* min_ptr, + T* moving_max_ptr, T* moving_min_ptr, T* scale, + T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T activation_max = max(fabs(max_ptr[gid]), fabs(min_ptr[gid])); + + if (moving_max_ptr[gid] == 0) + moving_max_ptr[gid] = activation_max; + else + moving_max_ptr[gid] = moving_max_ptr[gid] * momentum + activation_max * (1 - momentum); + + // NOTE(Liang Depeng): cambricon quantization only use moving_max to calculate the scale + moving_min_ptr[gid] = moving_max_ptr[gid]; + + scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +template +__global__ void CalFreezeScaleZeroPointCambricon(const int64_t elements, + const double quantization_bit, + const float momentum, const T* moving_max_ptr, + T* scale, 
T* zero_point) { + int64_t tid = threadIdx.x; + int64_t gid = (blockDim.x * blockIdx.x) + tid; + + while (gid < elements) { + T denominator = static_cast(pow(2.0, quantization_bit - 1)) - 1; + scale[gid] = floor(log2(moving_max_ptr[gid])) - (quantization_bit - 2); + zero_point[gid] = 0; + gid += gridDim.x * blockDim.x; + } +} + +ep::CudaLaunchConfig GetLaunchConfig(ep::CudaStream* stream, size_t thread_num, + size_t shared_mem_size) { + ep::CudaLaunchConfig config; + stream->InitLaunchConfigWithWaves(&config, thread_num, kCudaThreadsNumPerBlock, 1); + config.shared_mem_size = shared_mem_size; + return config; +} + +} // namespace + +#define LAUNCH_CUDA_KERNEL(func, stream, thread_num, shared_mem_size, ...) \ + (stream)->LaunchKernel(func, GetLaunchConfig((stream), thread_num, shared_mem_size), __VA_ARGS__); + +template +class GpuMovingAverageMinMaxObserverKernel final : public user_op::OpKernel { + public: + GpuMovingAverageMinMaxObserverKernel() = default; + ~GpuMovingAverageMinMaxObserverKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* current_train_step = + ctx->Tensor4ArgNameAndIndex("current_train_step", 0); + user_op::Tensor* moving_max = ctx->Tensor4ArgNameAndIndex("moving_max", 0); + user_op::Tensor* moving_min = ctx->Tensor4ArgNameAndIndex("moving_min", 0); + user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const bool is_training = ctx->Attr("training"); + const int64_t stop_update_after_iters = ctx->Attr("stop_update_after_iters"); + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const float momentum = ctx->Attr("momentum"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + int64_t elements = in->shape_view().elem_cnt(); + T* max_ptr = tmp_buffer->mut_dptr(); + T* min_ptr = max_ptr + 1; + + int64_t* host_current_train_step_ptr = new int64_t[current_train_step->shape_view().elem_cnt()]; + OF_CUDA_CHECK(hipMemcpy(host_current_train_step_ptr, current_train_step->dptr(), + current_train_step->shape_view().elem_cnt() * sizeof(int64_t), + hipMemcpyDefault)); + auto* cuda_stream = ctx->stream()->As(); + if (*host_current_train_step_ptr <= stop_update_after_iters && is_training) { + LAUNCH_CUDA_KERNEL((InitMaxMin), cuda_stream, 1, 0, 1, max_ptr, min_ptr); + LAUNCH_CUDA_KERNEL((ReduceMaxMinPerLayer), cuda_stream, elements, + kCudaThreadsNumPerBlock * 2 * sizeof(T), in->dptr(), elements, max_ptr, + min_ptr); + } + bool moving = (*host_current_train_step_ptr <= stop_update_after_iters) && is_training; + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + if (moving) { + LAUNCH_CUDA_KERNEL((CalScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, max_ptr, min_ptr, + moving_max->mut_dptr(), moving_min->mut_dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { + LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointSymmetric), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, moving_max->dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } + } else { // quantization_scheme == "affine" + if (moving) { + 
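+        // Affine scheme: CalScaleZeroPointAffine (defined above) updates moving_max/moving_min
+        // with the same momentum EMA and derives
+        //   scale = (moving_max - moving_min) / (2^quantization_bit - 1)
+        //   zero_point = -round(moving_min / scale)
+        // the frozen branch below reuses the stored moving statistics without updating them.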
LAUNCH_CUDA_KERNEL((CalScaleZeroPointAffine), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, max_ptr, min_ptr, + moving_max->mut_dptr(), moving_min->mut_dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { + LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointAffine), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, moving_max->dptr(), + moving_min->dptr(), scale->mut_dptr(), + zero_point->mut_dptr()); + } + } + } else if (quantization_formula == "cambricon") { + if (moving) { + LAUNCH_CUDA_KERNEL((CalScaleZeroPointCambricon), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, max_ptr, min_ptr, + moving_max->mut_dptr(), moving_min->mut_dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } else { + LAUNCH_CUDA_KERNEL((CalFreezeScaleZeroPointCambricon), cuda_stream, 1, 0, 1, + static_cast(quantization_bit), momentum, moving_max->dptr(), + scale->mut_dptr(), zero_point->mut_dptr()); + } + } else { + UNIMPLEMENTED(); + } + + delete[] host_current_train_step_ptr; + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(dtype) \ + REGISTER_USER_KERNEL("moving_average_min_max_observer") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { return 2 * sizeof(dtype); }) + +REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(float); +REGISTER_MOVING_AVERAGE_MIN_MAX_OBSERVER_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/multi_reduce_kernels.hip.cpp b/oneflow/user/kernels/multi_reduce_kernels.hip.cpp index 85440cf..4b33383 100644 --- a/oneflow/user/kernels/multi_reduce_kernels.hip.cpp +++ b/oneflow/user/kernels/multi_reduce_kernels.hip.cpp @@ -1,142 +1,142 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/multi_reduce_kernels.h" -#include "oneflow/core/ep/include/primitive/fill.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/device/cuda_util.h" -#include -#include - -namespace oneflow { - -namespace { - -constexpr int64_t kMultiReduceMaxPackSize = 64; - -template -struct MultiReduceParamsPack { - MultiReduceParam params[kMultiReduceMaxPackSize]; - size_t size; -}; - -template -__global__ void MultiBlockReduceGpu(TransformFn transform, - const MultiReduceParamsPack pack_params, const T init, - T* out) { - ReduceFn reduce_fn{}; - T t_out = init; - for (int i = 0; i < pack_params.size; ++i) { - const auto& param = pack_params.params[i]; - CUDA_1D_KERNEL_LOOP(j, param.size) { t_out = reduce_fn(t_out, transform(param.data[j])); } - } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_out = BlockReduce(temp_storage).Reduce(t_out, reduce_fn); - if (threadIdx.x == 0) { out[blockIdx.x] = b_out; } -} - -size_t InferTempStorageSize(user_op::InferContext* ctx) { - auto input_size = ctx->input_size("x"); - if (input_size == 0) { return 0; } - int64_t max_elem_cnt = 0; - int64_t pack_size = 0; - int32_t num_blocks = 0; - for (size_t i = 0; i < input_size; ++i) { - int64_t elem_cnt = ctx->InputShape("x", i).elem_cnt(); - max_elem_cnt = std::max(max_elem_cnt, elem_cnt); - pack_size++; - if (pack_size == kMultiReduceMaxPackSize || i == input_size - 1) { - CHECK_LT(max_elem_cnt, std::numeric_limits::max()); - num_blocks += BlocksNum4ThreadsNum(static_cast(max_elem_cnt)); - max_elem_cnt = 0; - pack_size = 0; - } - } - CHECK_LT(num_blocks, kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock) - << "Too much blocks needed for computing " << ctx->op_name() << ", should be less than " - << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock - << ", but got " << num_blocks; - size_t elem_size = GetSizeOfDataType(ctx->InputDType("x", 0)); - return GetCudaAlignedSize(num_blocks * elem_size * 2); -} - -} // namespace - -template -struct MultiReduce { - void operator()(ep::Stream* stream, TransformFn transform, - const std::vector>& params, T init, T* ret, T* temp) { - CHECK_NOTNULL(temp); - int32_t total_num_blocks = 0; - for (size_t i = 0; i < params.size(); i += kMultiReduceMaxPackSize) { - MultiReduceParamsPack pack_params{}; - size_t max_elem_cnt = 0; - pack_params.size = std::min(kMultiReduceMaxPackSize, params.size() - i); - for (size_t j = 0; j < pack_params.size; ++j) { - pack_params.params[j] = params[i + j]; - max_elem_cnt = std::max(max_elem_cnt, pack_params.params[j].size); - } - int32_t num_blocks = BlocksNum4ThreadsNum(max_elem_cnt); - MultiBlockReduceGpu - <<As()->cuda_stream()>>>( - transform, pack_params, init, temp + total_num_blocks); - total_num_blocks += num_blocks; - } - size_t wksp_size = 0; - auto DeviceReduce = [&](void* temp_storage) -> void { - OF_CUDA_CHECK(hipcub::DeviceReduce::Reduce(temp_storage, wksp_size, temp, ret, total_num_blocks, - ReduceFn{}, init, - stream->As()->cuda_stream())); - }; - DeviceReduce(nullptr); - // NOTE(zwx): We have allocated the temp storage with the space - // that can hold all the elements to reduce, - // normally the `temp_storage_bytes` for hipcub::DeviceReduce shouldn't exceed it. 
- CHECK_LE(wksp_size, total_num_blocks * sizeof(T)) - << wksp_size << " size in bytes of temp storage is needed for doing hipcub::DeviceReduce, " - << "but only allocated " << total_num_blocks * sizeof(T); - DeviceReduce(temp + total_num_blocks); - } -}; - -#define REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("multi_reduce_sum_pow_abs") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTempStorageSize); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL(op_type_name, ximum_enum, dtype) \ - REGISTER_USER_KERNEL(op_type_name) \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTempStorageSize); - -#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_min_abs", Ximum::kMin, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_max_abs", Ximum::kMax, dtype) \ - REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_min_abs", Ximum::kMin, dtype) - -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(float) -REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(double) - -REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(float) -REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/multi_reduce_kernels.h" +#include "oneflow/core/ep/include/primitive/fill.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/device/cuda_util.h" +#include +#include + +namespace oneflow { + +namespace { + +constexpr int64_t kMultiReduceMaxPackSize = 64; + +template +struct MultiReduceParamsPack { + MultiReduceParam params[kMultiReduceMaxPackSize]; + size_t size; +}; + +template +__global__ void MultiBlockReduceGpu(TransformFn transform, + const MultiReduceParamsPack pack_params, const T init, + T* out) { + ReduceFn reduce_fn{}; + T t_out = init; + for (int i = 0; i < pack_params.size; ++i) { + const auto& param = pack_params.params[i]; + CUDA_1D_KERNEL_LOOP(j, param.size) { t_out = reduce_fn(t_out, transform(param.data[j])); } + } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_out = BlockReduce(temp_storage).Reduce(t_out, reduce_fn); + if (threadIdx.x == 0) { out[blockIdx.x] = b_out; } +} + +size_t InferTempStorageSize(user_op::InferContext* ctx) { + auto input_size = ctx->input_size("x"); + if (input_size == 0) { return 0; } + int64_t max_elem_cnt = 0; + int64_t pack_size = 0; + int32_t num_blocks = 0; + for (size_t i = 0; i < input_size; ++i) { + int64_t elem_cnt = ctx->InputShape("x", i).elem_cnt(); + max_elem_cnt = std::max(max_elem_cnt, elem_cnt); + pack_size++; + if (pack_size == kMultiReduceMaxPackSize || i == input_size - 1) { + CHECK_LT(max_elem_cnt, std::numeric_limits::max()); + num_blocks += BlocksNum4ThreadsNum(static_cast(max_elem_cnt)); + max_elem_cnt = 0; + pack_size = 0; + } + } + CHECK_LT(num_blocks, kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock * kCudaThreadsNumPerBlock) + << "Too much blocks needed for computing " << ctx->op_name() << ", should be less than " + << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock << "*" << kCudaThreadsNumPerBlock + << ", but got " << num_blocks; + size_t elem_size = GetSizeOfDataType(ctx->InputDType("x", 0)); + return GetCudaAlignedSize(num_blocks * elem_size * 2); +} + +} // namespace + +template +struct MultiReduce { + void operator()(ep::Stream* stream, TransformFn transform, + const std::vector>& params, T init, T* ret, T* temp) { + CHECK_NOTNULL(temp); + int32_t total_num_blocks = 0; + for (size_t i = 0; i < params.size(); i += kMultiReduceMaxPackSize) { + MultiReduceParamsPack pack_params{}; + size_t max_elem_cnt = 0; + pack_params.size = std::min(kMultiReduceMaxPackSize, params.size() - i); + for (size_t j = 0; j < pack_params.size; ++j) { + pack_params.params[j] = params[i + j]; + max_elem_cnt = std::max(max_elem_cnt, pack_params.params[j].size); + } + int32_t num_blocks = BlocksNum4ThreadsNum(max_elem_cnt); + MultiBlockReduceGpu + <<As()->cuda_stream()>>>( + transform, pack_params, init, temp + total_num_blocks); + total_num_blocks += num_blocks; + } + size_t wksp_size = 0; + auto DeviceReduce = [&](void* temp_storage) -> void { + OF_CUDA_CHECK(hipcub::DeviceReduce::Reduce(temp_storage, wksp_size, temp, ret, total_num_blocks, + ReduceFn{}, init, + stream->As()->cuda_stream())); + }; + DeviceReduce(nullptr); + // NOTE(zwx): We have allocated the temp storage with the space + // that can hold all the elements to reduce, + // normally the `temp_storage_bytes` for hipcub::DeviceReduce shouldn't exceed it. 
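+    // InferTempStorageSize sizes `temp` at 2 * num_blocks * elem_size: the first total_num_blocks
+    // elements hold the per-block partials written by MultiBlockReduceGpu above, and the second
+    // half (temp + total_num_blocks) is reused below as hipcub::DeviceReduce's workspace.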
+ CHECK_LE(wksp_size, total_num_blocks * sizeof(T)) + << wksp_size << " size in bytes of temp storage is needed for doing hipcub::DeviceReduce, " + << "but only allocated " << total_num_blocks * sizeof(T); + DeviceReduce(temp + total_num_blocks); + } +}; + +#define REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("multi_reduce_sum_pow_abs") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTempStorageSize); + +#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL(op_type_name, ximum_enum, dtype) \ + REGISTER_USER_KERNEL(op_type_name) \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTempStorageSize); + +#define REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_max_abs", Ximum::kMax, dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("multi_reduce_min_abs", Ximum::kMin, dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_max_abs", Ximum::kMax, dtype) \ + REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNEL("local_multi_reduce_min_abs", Ximum::kMin, dtype) + +REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(float) +REGISTER_MULTI_REDUCE_SUM_POW_ABS_CUDA_KERNEL(double) + +REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(float) +REGISTER_MULTI_REDUCE_XIMUM_ABS_CUDA_KERNELS(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp b/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp index 5f0cea1..2a974e1 100644 --- a/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp +++ b/oneflow/user/kernels/nd_index_slice_kernels.hip.cpp @@ -1,166 +1,166 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/nd_index_slice_kernels.h" -#include "oneflow/core/hip/atomic.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void CudaGatherNd(NdIndexSliceArgs args, const I* indices, const T* dense, - T* slices) { - DoGatherNd(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, args.dense_shape, - indices, dense, slices); -} - -template -__global__ void CudaScatterNdAdd(NdIndexSliceArgs args, const I* indices, const T* slices, - T* dense) { - DoScatterNdAdd(args.num_slices * args.slice_size, args.slice_size, - args.index_ndims, args.dense_shape, indices, slices, dense); -} - -template -__global__ void CudaScatterNdUpdate(NdIndexSliceArgs args, const I* indices, const T* slices, - T* dense) { - DoScatterNdUpdate(args.num_slices * args.slice_size, args.slice_size, - args.index_ndims, args.dense_shape, indices, slices, dense); -} - -template -__global__ void CudaFillByNdIndex(NdIndexSliceArgs args, const I* indices, T* dense, - T value) { - DoFillByNdIndex(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, - args.dense_shape, indices, dense, value); -} - -} // namespace - -template -struct GatherNdFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - const T* dense, T* slices) const { - RUN_CUDA_KERNEL((CudaGatherNd), stream, args.num_slices * args.slice_size, args, indices, - dense, slices); - } -}; - -template -struct ScatterNdAddFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - const T* slices, T* dense) const { - RUN_CUDA_KERNEL((CudaScatterNdAdd), stream, args.num_slices * args.slice_size, args, - indices, slices, dense); - } -}; - -template -struct ScatterNdUpdateFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - const T* slices, T* dense) const { - RUN_CUDA_KERNEL((CudaScatterNdUpdate), stream, args.num_slices * args.slice_size, args, - indices, slices, dense); - } -}; - -template -struct FillByNdIndexFunctor final { - void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, - T* dense, T value) const { - RUN_CUDA_KERNEL((CudaFillByNdIndex), stream, args.num_slices * args.slice_size, args, - indices, dense, value); - } -}; - -template -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const T* x, T* y) { cuda::atomic::Add(y, *x); } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const bool* x, bool* y) { *y += *x; } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const uint8_t* x, uint8_t* y) { *y += *x; } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const int8_t* x, int8_t* y) { *y += *x; } -}; - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const int64_t* x, int64_t* y) { *y += *x; } -}; - -#define CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ \ - FLOATING_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - INSTANTIATE_GATHER_ND_FUNCTOR, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SCATTER_ND_ADD_FUNCTOR, (DeviceType::kCUDA), - CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - 
-OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_FILL_BY_ND_INDEX_FUNCTOR, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - REGISTER_GATHER_ND_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - REGISTER_SCATTER_ND_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SCATTER_ND_LIKE_KERNELS, (DeviceType::kCUDA), - CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( - REGISTER_TENSOR_GATHER_ND_UPDATE_KERNELS, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_TENSOR_GATHER_ND_ADD_KERNELS, (DeviceType::kCUDA), - CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000) || defined(__HIP_DEVICE_COMPILE__) - -template<> -struct DeviceAdd { - __device__ __forceinline__ static void Invoke(const float16* x, float16* y) { - cuda::atomic::Add(reinterpret_cast(y), *(reinterpret_cast(x))); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ND_INDEX_SLICE_FUNCTORS, (DeviceType::kCUDA), - FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ND_INDEX_SLICE_KERNELS, (DeviceType::kCUDA), - FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -#endif - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/nd_index_slice_kernels.h" +#include "oneflow/core/hip/atomic.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void CudaGatherNd(NdIndexSliceArgs args, const I* indices, const T* dense, + T* slices) { + DoGatherNd(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, args.dense_shape, + indices, dense, slices); +} + +template +__global__ void CudaScatterNdAdd(NdIndexSliceArgs args, const I* indices, const T* slices, + T* dense) { + DoScatterNdAdd(args.num_slices * args.slice_size, args.slice_size, + args.index_ndims, args.dense_shape, indices, slices, dense); +} + +template +__global__ void CudaScatterNdUpdate(NdIndexSliceArgs args, const I* indices, const T* slices, + T* dense) { + DoScatterNdUpdate(args.num_slices * args.slice_size, args.slice_size, + args.index_ndims, args.dense_shape, indices, slices, dense); +} + +template +__global__ void CudaFillByNdIndex(NdIndexSliceArgs args, const I* indices, T* dense, + T value) { + DoFillByNdIndex(args.num_slices * args.slice_size, args.slice_size, args.index_ndims, + args.dense_shape, indices, dense, value); +} + +} // namespace + +template +struct GatherNdFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + const T* dense, T* slices) const { + RUN_CUDA_KERNEL((CudaGatherNd), stream, args.num_slices * args.slice_size, args, indices, + dense, slices); + } +}; + +template +struct ScatterNdAddFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + const T* slices, T* dense) const { + RUN_CUDA_KERNEL((CudaScatterNdAdd), stream, args.num_slices * args.slice_size, args, + indices, slices, dense); + } +}; + +template +struct ScatterNdUpdateFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + const T* slices, T* dense) const { + RUN_CUDA_KERNEL((CudaScatterNdUpdate), stream, args.num_slices * args.slice_size, args, + indices, slices, dense); + } +}; + +template +struct FillByNdIndexFunctor final { + void operator()(ep::Stream* stream, const NdIndexSliceArgs& args, const I* indices, + T* dense, T value) const { + RUN_CUDA_KERNEL((CudaFillByNdIndex), stream, args.num_slices * args.slice_size, args, + indices, dense, value); + } +}; + +template +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const T* x, T* y) { cuda::atomic::Add(y, *x); } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const bool* x, bool* y) { *y += *x; } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const uint8_t* x, uint8_t* y) { *y += *x; } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const int8_t* x, int8_t* y) { *y += *x; } +}; + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const int64_t* x, int64_t* y) { *y += *x; } +}; + +#define CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ \ + FLOATING_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + INSTANTIATE_GATHER_ND_FUNCTOR, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SCATTER_ND_ADD_FUNCTOR, (DeviceType::kCUDA), + CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + 
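+// NOTE: CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ above is the floating types plus int32, which go
+// through cuda::atomic::Add; the bool instantiation relies on the plain (non-atomic) DeviceAdd
+// specialization defined earlier.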
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_FILL_BY_ND_INDEX_FUNCTOR, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + REGISTER_GATHER_ND_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + REGISTER_SCATTER_ND_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SCATTER_ND_LIKE_KERNELS, (DeviceType::kCUDA), + CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE( + REGISTER_TENSOR_GATHER_ND_UPDATE_KERNELS, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_TENSOR_GATHER_ND_ADD_KERNELS, (DeviceType::kCUDA), + CUDA_ATOMIC_ADD_SUPPORTED_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000) || defined(__HIP_DEVICE_COMPILE__) + +template<> +struct DeviceAdd { + __device__ __forceinline__ static void Invoke(const float16* x, float16* y) { + cuda::atomic::Add(reinterpret_cast(y), *(reinterpret_cast(x))); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ND_INDEX_SLICE_FUNCTORS, (DeviceType::kCUDA), + FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ND_INDEX_SLICE_KERNELS, (DeviceType::kCUDA), + FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +#endif + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/nll_kernel_util.hip.cpp b/oneflow/user/kernels/nll_kernel_util.hip.cpp index 90c82b7..52c68e5 100644 --- a/oneflow/user/kernels/nll_kernel_util.hip.cpp +++ b/oneflow/user/kernels/nll_kernel_util.hip.cpp @@ -1,93 +1,93 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/user/kernels/nll_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "hip/hip_runtime.h" - -namespace oneflow { - -namespace { - -template -__global__ void NLLForward(const int32_t num_samples, const K num_classes, const K class_start, - const K ignore_index, const T* input, const K* target, const T* weight, - T* out, T* out_weight) { - const T zero = GetZeroVal(); - const T one = GetOneVal(); - CUDA_1D_KERNEL_LOOP(i, num_samples) { - K label = target[i]; - T w = zero; - T y = zero; - if (label != ignore_index) { - label -= class_start; - if (label >= 0 && label < num_classes) { - w = weight ? 
weight[label] : one; - y = -(input[i * num_classes + label] * w); - } - } - out[i] = y; - out_weight[i] = w; - } -} - -template -__global__ void NLLBackward(const int32_t num_samples, const K num_classes, const K class_start, - const K ignore_index, const T* out_grad, const K* target, - const T* weight, T* in_grad) { - const T one = GetOneVal(); - const T zero = GetZeroVal(); - CUDA_1D_KERNEL_LOOP_T(K, i, num_samples * num_classes) { - const K n = i / num_classes; - const K idx = i - n * num_classes; - const K label = target[n]; - if (label != ignore_index && idx == label - class_start) { - in_grad[i] = out_grad[n] * (weight ? -weight[idx] : -one); - } else { - in_grad[i] = zero; - } - } -} - -} // namespace - -template -struct NLLKernelUtil { - static void Forward(ep::Stream* stream, const int32_t num_samples, const K num_classes, - const K class_start, const K ignore_index, const T* input, const K* target, - const T* weight, T* out, T* out_weight) { - NLLForward<<As()->cuda_stream()>>>(num_samples, num_classes, - class_start, ignore_index, input, - target, weight, out, out_weight); - } - - static void Backward(ep::Stream* stream, const int32_t num_samples, const K num_classes, - const K class_start, const K ignore_index, const T* out_grad, - const K* target, const T* weight, T* in_grad) { - NLLBackward<<As()->cuda_stream()>>>( - num_samples, num_classes, class_start, ignore_index, out_grad, target, weight, in_grad); - } -}; - -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; -template struct NLLKernelUtil; - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/nll_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "hip/hip_runtime.h" + +namespace oneflow { + +namespace { + +template +__global__ void NLLForward(const int32_t num_samples, const K num_classes, const K class_start, + const K ignore_index, const T* input, const K* target, const T* weight, + T* out, T* out_weight) { + const T zero = GetZeroVal(); + const T one = GetOneVal(); + CUDA_1D_KERNEL_LOOP(i, num_samples) { + K label = target[i]; + T w = zero; + T y = zero; + if (label != ignore_index) { + label -= class_start; + if (label >= 0 && label < num_classes) { + w = weight ? weight[label] : one; + y = -(input[i * num_classes + label] * w); + } + } + out[i] = y; + out_weight[i] = w; + } +} + +template +__global__ void NLLBackward(const int32_t num_samples, const K num_classes, const K class_start, + const K ignore_index, const T* out_grad, const K* target, + const T* weight, T* in_grad) { + const T one = GetOneVal(); + const T zero = GetZeroVal(); + CUDA_1D_KERNEL_LOOP_T(K, i, num_samples * num_classes) { + const K n = i / num_classes; + const K idx = i - n * num_classes; + const K label = target[n]; + if (label != ignore_index && idx == label - class_start) { + in_grad[i] = out_grad[n] * (weight ? 
-weight[idx] : -one); + } else { + in_grad[i] = zero; + } + } +} + +} // namespace + +template +struct NLLKernelUtil { + static void Forward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* input, const K* target, + const T* weight, T* out, T* out_weight) { + NLLForward<<As()->cuda_stream()>>>(num_samples, num_classes, + class_start, ignore_index, input, + target, weight, out, out_weight); + } + + static void Backward(ep::Stream* stream, const int32_t num_samples, const K num_classes, + const K class_start, const K ignore_index, const T* out_grad, + const K* target, const T* weight, T* in_grad) { + NLLBackward<<As()->cuda_stream()>>>( + num_samples, num_classes, class_start, ignore_index, out_grad, target, weight, in_grad); + } +}; + +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; +template struct NLLKernelUtil; + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/nms_kernel.hip.cpp b/oneflow/user/kernels/nms_kernel.hip.cpp index 0be72a5..bc8a3eb 100644 --- a/oneflow/user/kernels/nms_kernel.hip.cpp +++ b/oneflow/user/kernels/nms_kernel.hip.cpp @@ -1,145 +1,145 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int kBlockSize = sizeof(int64_t) * 8; - -template -__host__ __device__ __forceinline__ T CeilDiv(T a, T b) { - return (a + b - 1) / b; -} - -template -__host__ __device__ __forceinline__ T IoU(T const* const a, T const* const b) { - T interS = - max(min(a[2], b[2]) - max(a[0], b[0]), 0.f) * max(min(a[3], b[3]) - max(a[1], b[1]), 0.f); - T Sa = (a[2] - a[0]) * (a[3] - a[1]); - T Sb = (b[2] - b[0]) * (b[3] - b[1]); - return interS / (Sa + Sb - interS); -} - -template -__global__ void CalcSuppressionBitmaskMatrix(int num_boxes, float iou_threshold, const T* boxes, - int64_t* suppression_bmask_matrix) { - const int row = blockIdx.y; - const int col = blockIdx.x; - - if (row > col) return; - - const int row_size = min(num_boxes - row * kBlockSize, kBlockSize); - const int col_size = min(num_boxes - col * kBlockSize, kBlockSize); - - __shared__ T block_boxes[kBlockSize * 4]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 4 + 0] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 0]; - block_boxes[threadIdx.x * 4 + 1] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 1]; - block_boxes[threadIdx.x * 4 + 2] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 2]; - block_boxes[threadIdx.x * 4 + 3] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 3]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = kBlockSize * row + threadIdx.x; - const T* cur_box_ptr = boxes + cur_box_idx * 4; - unsigned long long bits = 0; - int start = 0; - if (row == col) { start = threadIdx.x + 1; } - for (int i = start; i < col_size; i++) { - if (IoU(cur_box_ptr, block_boxes + i * 4) > iou_threshold) { bits |= 1Ull << i; } - } - suppression_bmask_matrix[cur_box_idx * gridDim.y + col] = bits; - } -} - -__global__ void ScanSuppression(int num_boxes, int num_blocks, int num_keep, - int64_t* suppression_bmask, int8_t* keep_mask) { - extern __shared__ int64_t remv[]; - remv[threadIdx.x] = 0; - for (int i = 0; i < num_boxes; ++i) { - int block_n = i / kBlockSize; - int block_i = i % kBlockSize; - if (!(remv[block_n] & (1Ull << block_i))) { - remv[threadIdx.x] |= suppression_bmask[i * num_blocks + threadIdx.x]; - if (threadIdx.x == block_n && num_keep > 0) { - keep_mask[i] = 1; - num_keep -= 1; - } - } - } -} - -} // namespace - -template -class NmsGpuKernel final : public user_op::OpKernel { - public: - NmsGpuKernel() = default; - ~NmsGpuKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* boxes_blob = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* keep_blob = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const T* boxes = boxes_blob->dptr(); - int8_t* keep = keep_blob->mut_dptr(); - int64_t* suppression_mask = tmp_blob->mut_dptr(); - - const int num_boxes = boxes_blob->shape_view().At(0); - int num_keep = ctx->Attr("keep_n"); - if (num_keep <= 0 || num_keep > num_boxes) { num_keep = num_boxes; } - const int num_blocks = CeilDiv(num_boxes, kBlockSize); - Memset(ctx->stream(), suppression_mask, 0, - num_boxes * num_blocks * sizeof(int64_t)); - Memset(ctx->stream(), keep, 0, num_boxes * sizeof(int8_t)); - - dim3 blocks(num_blocks, num_blocks); - dim3 threads(kBlockSize); - 
CalcSuppressionBitmaskMatrix<<stream()->As()->cuda_stream()>>>( - num_boxes, ctx->Attr("iou_threshold"), boxes, suppression_mask); - ScanSuppression<<<1, num_blocks, num_blocks * sizeof(int64_t), - ctx->stream()->As()->cuda_stream()>>>( - num_boxes, num_blocks, num_keep, suppression_mask, keep); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_NMS_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("nms") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == DataType::kInt8) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0); \ - int64_t num_boxes = in_shape->At(0); \ - int64_t blocks = CeilDiv(num_boxes, kBlockSize); \ - return num_boxes * blocks * sizeof(int64_t); \ - }); - -REGISTER_NMS_CUDA_KERNEL(float) -REGISTER_NMS_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int kBlockSize = sizeof(int64_t) * 8; + +template +__host__ __device__ __forceinline__ T CeilDiv(T a, T b) { + return (a + b - 1) / b; +} + +template +__host__ __device__ __forceinline__ T IoU(T const* const a, T const* const b) { + T interS = + max(min(a[2], b[2]) - max(a[0], b[0]), 0.f) * max(min(a[3], b[3]) - max(a[1], b[1]), 0.f); + T Sa = (a[2] - a[0]) * (a[3] - a[1]); + T Sb = (b[2] - b[0]) * (b[3] - b[1]); + return interS / (Sa + Sb - interS); +} + +template +__global__ void CalcSuppressionBitmaskMatrix(int num_boxes, float iou_threshold, const T* boxes, + int64_t* suppression_bmask_matrix) { + const int row = blockIdx.y; + const int col = blockIdx.x; + + if (row > col) return; + + const int row_size = min(num_boxes - row * kBlockSize, kBlockSize); + const int col_size = min(num_boxes - col * kBlockSize, kBlockSize); + + __shared__ T block_boxes[kBlockSize * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = boxes[(kBlockSize * col + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kBlockSize * row + threadIdx.x; + const T* cur_box_ptr = boxes + cur_box_idx * 4; + unsigned long long bits = 0; + int start = 0; + if (row == col) { start = threadIdx.x + 1; } + for (int i = start; i < col_size; i++) { + if (IoU(cur_box_ptr, block_boxes + i * 4) > iou_threshold) { bits |= 1Ull << i; } + } + suppression_bmask_matrix[cur_box_idx * gridDim.y + col] = bits; + } +} + 
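+
+// For intuition, the two NMS kernels above split a greedy non-maximum suppression into
+// (1) a pairwise IoU pass that packs "column box j suppresses row box i" decisions into
+// 64-bit rows, one bit per box in a 64-wide column block, and (2) a single-block scan
+// that ORs the rows of surviving boxes and emits int8 keep flags, capped at keep_n.
+// A sequential CPU sketch of the same greedy scheme follows, assuming boxes are stored
+// as [x1, y1, x2, y2] and already sorted by score; IoURef/NmsRef are illustrative names,
+// not part of OneFlow.
+//
+// #include <algorithm>
+// #include <cstdint>
+// #include <cstdio>
+// #include <vector>
+//
+// static float IoURef(const float* a, const float* b) {
+//   const float inter = std::max(std::min(a[2], b[2]) - std::max(a[0], b[0]), 0.f)
+//                     * std::max(std::min(a[3], b[3]) - std::max(a[1], b[1]), 0.f);
+//   const float sa = (a[2] - a[0]) * (a[3] - a[1]);
+//   const float sb = (b[2] - b[0]) * (b[3] - b[1]);
+//   return inter / (sa + sb - inter);
+// }
+//
+// // Greedy NMS: walk boxes in score order, keep a box unless an earlier kept box
+// // overlaps it above the threshold, and stop once keep_n boxes have been kept.
+// static std::vector<int8_t> NmsRef(const std::vector<float>& boxes, float iou_threshold,
+//                                   int keep_n) {
+//   const int n = static_cast<int>(boxes.size() / 4);
+//   if (keep_n <= 0 || keep_n > n) { keep_n = n; }
+//   std::vector<int8_t> keep(n, 0);
+//   std::vector<uint8_t> suppressed(n, 0);
+//   for (int i = 0; i < n && keep_n > 0; ++i) {
+//     if (suppressed[i]) { continue; }
+//     keep[i] = 1;
+//     keep_n -= 1;
+//     for (int j = i + 1; j < n; ++j) {
+//       if (!suppressed[j] && IoURef(&boxes[i * 4], &boxes[j * 4]) > iou_threshold) {
+//         suppressed[j] = 1;
+//       }
+//     }
+//   }
+//   return keep;
+// }
+//
+// int main() {
+//   // Two heavily overlapping boxes followed by a disjoint one; expected keep = 1 0 1.
+//   std::vector<float> boxes = {0, 0, 10, 10, 1, 1, 10, 10, 20, 20, 30, 30};
+//   for (int8_t k : NmsRef(boxes, 0.7f, -1)) { std::printf("%d ", static_cast<int>(k)); }
+//   std::printf("\n");
+//   return 0;
+// }
+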
+__global__ void ScanSuppression(int num_boxes, int num_blocks, int num_keep, + int64_t* suppression_bmask, int8_t* keep_mask) { + extern __shared__ int64_t remv[]; + remv[threadIdx.x] = 0; + for (int i = 0; i < num_boxes; ++i) { + int block_n = i / kBlockSize; + int block_i = i % kBlockSize; + if (!(remv[block_n] & (1Ull << block_i))) { + remv[threadIdx.x] |= suppression_bmask[i * num_blocks + threadIdx.x]; + if (threadIdx.x == block_n && num_keep > 0) { + keep_mask[i] = 1; + num_keep -= 1; + } + } + } +} + +} // namespace + +template +class NmsGpuKernel final : public user_op::OpKernel { + public: + NmsGpuKernel() = default; + ~NmsGpuKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* boxes_blob = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* keep_blob = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const T* boxes = boxes_blob->dptr(); + int8_t* keep = keep_blob->mut_dptr(); + int64_t* suppression_mask = tmp_blob->mut_dptr(); + + const int num_boxes = boxes_blob->shape_view().At(0); + int num_keep = ctx->Attr("keep_n"); + if (num_keep <= 0 || num_keep > num_boxes) { num_keep = num_boxes; } + const int num_blocks = CeilDiv(num_boxes, kBlockSize); + Memset(ctx->stream(), suppression_mask, 0, + num_boxes * num_blocks * sizeof(int64_t)); + Memset(ctx->stream(), keep, 0, num_boxes * sizeof(int8_t)); + + dim3 blocks(num_blocks, num_blocks); + dim3 threads(kBlockSize); + CalcSuppressionBitmaskMatrix<<stream()->As()->cuda_stream()>>>( + num_boxes, ctx->Attr("iou_threshold"), boxes, suppression_mask); + ScanSuppression<<<1, num_blocks, num_blocks * sizeof(int64_t), + ctx->stream()->As()->cuda_stream()>>>( + num_boxes, num_blocks, num_keep, suppression_mask, keep); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_NMS_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("nms") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == DataType::kInt8) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + Shape* in_shape = ctx->Shape4ArgNameAndIndex("in", 0); \ + int64_t num_boxes = in_shape->At(0); \ + int64_t blocks = CeilDiv(num_boxes, kBlockSize); \ + return num_boxes * blocks * sizeof(int64_t); \ + }); + +REGISTER_NMS_CUDA_KERNEL(float) +REGISTER_NMS_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/normalization_kernel.hip.cpp b/oneflow/user/kernels/normalization_kernel.hip.cpp index e5ea5f0..f80809a 100644 --- a/oneflow/user/kernels/normalization_kernel.hip.cpp +++ b/oneflow/user/kernels/normalization_kernel.hip.cpp @@ -1,534 +1,534 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM - -#include -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cudnn_util.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "hip/hsa_detail/device_functions.h" - -namespace oneflow { - -namespace { - -void InferDimSizeAndDataFormat(const ShapeView& x_shape, const int32_t axis, int32_t* n, int32_t* c, - int32_t* h, int32_t* w, hipdnnTensorFormat_t* format) { - if (x_shape.Count(axis + 1) == 1) { - if (axis == 0) { - *n = 1; - *h = 1; - } else { - *n = x_shape.At(0); - *h = x_shape.Count(1, axis); - } - *w = 1; - *c = x_shape.At(axis); - // *format = HIPDNN_TENSOR_NHWC; - *format = HIPDNN_TENSOR_NCHW; - // std::cout << "don't surpport HIPDNN_TENSOR_NHWC, use HIPDNN_TENSOR_NCHW instead, maybe cause wrong results" << std::endl; - } else { - *n = x_shape.Count(0, axis); - *c = x_shape.At(axis); - *h = x_shape.Count(axis + 1); - *w = 1; - *format = HIPDNN_TENSOR_NCHW; - } -} - -void InferXYCudnnTensorDesc(const ShapeView& xy_shape, const DataType& data_type, - const int32_t axis, hipdnnTensorDescriptor_t xy_desc) { - int32_t n, c, h, w; - hipdnnTensorFormat_t format; - InferDimSizeAndDataFormat(xy_shape, axis, &n, &c, &h, &w, &format); - OF_CUDNN_CHECK( - hipdnnSetTensor4dDescriptor(xy_desc, format, GetCudnnDataType(data_type), n, c, h, w)); -} - -void InferParamCudnnTensorDesc(const hipdnnTensorDescriptor_t xy_desc, hipdnnBatchNormMode_t mode, - hipdnnTensorDescriptor_t param_desc) { - OF_CUDNN_CHECK(hipdnnDeriveBNTensorDescriptor(param_desc, xy_desc, mode)); -} - -class CudnnTensorDescHelper final { - public: - OF_DISALLOW_COPY_AND_MOVE(CudnnTensorDescHelper); - CudnnTensorDescHelper(const ShapeView& xy_shape, const DataType& data_type, const int32_t axis, - hipdnnBatchNormMode_t mode) { - OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(&xy_desc_)); - InferXYCudnnTensorDesc(xy_shape, data_type, axis, xy_desc_); - OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(¶m_desc_)); - InferParamCudnnTensorDesc(xy_desc_, mode, param_desc_); - int n, c, h, w, n_stride, c_stride, h_stride, w_stride; - OF_CUDNN_CHECK(hipdnnGetTensor4dDescriptor(param_desc_, ¶m_data_type_, &n, &c, &h, &w, - &n_stride, &c_stride, &h_stride, &w_stride)); - param_size_ = c; - } - ~CudnnTensorDescHelper() { - OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(param_desc_)); - OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(xy_desc_)); - } - - hipdnnTensorDescriptor_t xy_desc() const { return xy_desc_; } - - hipdnnTensorDescriptor_t param_desc() const { return param_desc_; } - - void CheckParamTensor(const user_op::Tensor* tensor) const { - CHECK_NOTNULL(tensor); - CHECK_EQ(tensor->shape_view().NumAxes(), 1); - CHECK_EQ(tensor->shape_view().At(0), param_size_); - // CHECK_EQ(GetCudnnDataType(tensor->data_type()), param_data_type_); - } - - private: - hipdnnTensorDescriptor_t xy_desc_ = nullptr; - hipdnnTensorDescriptor_t param_desc_ = nullptr; - hipdnnDataType_t param_data_type_; - int32_t param_size_ = 0; -}; - -size_t InferTrainWorkspaceSize(const ShapeView& x_shape, const DataType data_type, - const int32_t axis) { - return 1; -} - -size_t InferTrainTmpSize(user_op::InferContext* ctx) { - const auto& x = ctx->InputTensorDesc("x", 0); - const auto axis = ctx->Attr("axis"); - return InferTrainWorkspaceSize(x.shape(), x.data_type(), axis); -} - -size_t InferGradWorkspaceSize(const ShapeView& x_shape, const DataType data_type, - const int32_t axis) { - return 
1; -} - -size_t InferGradTmpSize(user_op::InferContext* ctx) { - const auto& dy = ctx->InputTensorDesc("dy", 0); - const auto axis = ctx->Attr("axis"); - size_t tmp_size = 0; - if (ctx->op_type_name() == "normalization_add_relu_grad" && !ctx->has_output("addend_diff", 0)) { - tmp_size += GetCudaAlignedSize(dy.shape().elem_cnt() * GetSizeOfDataType(dy.data_type())); - } - tmp_size += GetCudaAlignedSize(InferGradWorkspaceSize(dy.shape(), dy.data_type(), axis)); - return tmp_size; -} - -template -class NormalizationInferenceKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - NormalizationInferenceKernel() = default; - ~NormalizationInferenceKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const bool training = ctx->Attr("training"); - CHECK(!training); - const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); - auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); - auto* moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); - auto* moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); - const auto axis = ctx->Attr("axis"); - const auto epsilon = ctx->Attr("epsilon"); - - const DataType data_type = x->data_type(); - CHECK_EQ(x->shape_view(), y->shape_view()); - CHECK_EQ(y->data_type(), data_type); - CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape_view().NumAxes()); - - const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, - HIPDNN_BATCHNORM_SPATIAL); - desc_helper.CheckParamTensor(gamma); - desc_helper.CheckParamTensor(beta); - desc_helper.CheckParamTensor(moving_mean); - desc_helper.CheckParamTensor(moving_variance); - - const void* sp_alpha = CudnnSPOnePtr(); - const void* sp_beta; - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape_view(), y->shape_view()); - Memcpy( - ctx->stream(), y->mut_dptr(), add_to_output->dptr(), - add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); - sp_beta = CudnnSPOnePtr(); - } else { - sp_beta = CudnnSPZeroPtr(); - } - - OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardInference( - ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL, sp_alpha, - sp_beta, desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), - desc_helper.param_desc(), gamma->dptr(), beta->dptr(), moving_mean->dptr(), - moving_variance->dptr(), epsilon)); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_BN_INFERENCE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value) \ - && (user_op::HobAttr("training") == false)) \ - .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - if (ctx.has_input("_add_to_output", 0)) { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ - } \ - return Maybe::Ok(); \ - }); - -REGISTER_BN_INFERENCE_KERNEL(float16) -REGISTER_BN_INFERENCE_KERNEL(float) -REGISTER_BN_INFERENCE_KERNEL(double) - -#undef REGISTER_BN_INFERENCE_KERNEL - -constexpr int64_t 
kCudaWarpSize = 64; - -template -__global__ void ReluGpu(int64_t n, const T* x, T* y, int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - CUDA_1D_KERNEL_LOOP(i, n) { - const T x_val = x[i]; - const bool is_positive = (x_val > 0); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? x_val : 0; - } -} - -template<> -__global__ void ReluGpu(int64_t n, const half* x, half* y, int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - const half zero = __float2half(0.0f); - CUDA_1D_KERNEL_LOOP(i, n) { - const half x_val = x[i]; - const bool is_positive = __hgt(x_val, zero); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? x_val : zero; - } -} - -template -__global__ void AddReluGpu(int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - CUDA_1D_KERNEL_LOOP(i, n) { - const T sum = x[i] + addend[i]; - const bool is_positive = (sum > 0); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? sum : 0; - } -} - -template<> -__global__ void AddReluGpu(int64_t n, const half* x, const half* addend, half* y, - int64_t* mask) { - const int32_t lane_id = threadIdx.x % kCudaWarpSize; - const half zero = __float2half(0.0f); - CUDA_1D_KERNEL_LOOP(i, n) { - const half sum = __hadd(x[i], addend[i]); - const bool is_positive = __hgt(sum, zero); - int64_t warp_mask = __ballot(static_cast(is_positive)); - if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } - y[i] = is_positive ? sum : zero; - } -} - -template -void Relu(ep::Stream* stream, int64_t n, const T* x, T* y, int64_t* mask) { - ReluGpu<<As()->cuda_stream()>>>(n, x, y, mask); -} - -template<> -void Relu(ep::Stream* stream, int64_t n, const float16* x, float16* y, int64_t* mask) { - Relu(stream, n, reinterpret_cast(x), reinterpret_cast(y), mask); -} - -template -void AddRelu(ep::Stream* stream, int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { - AddReluGpu<<As()->cuda_stream()>>>(n, x, addend, y, mask); -} - -template<> -void AddRelu(ep::Stream* stream, int64_t n, const float16* x, const float16* addend, - float16* y, int64_t* mask) { - AddRelu(stream, n, reinterpret_cast(x), reinterpret_cast(addend), - reinterpret_cast(y), mask); -} - -template -__global__ void ReluBackwardGpu(int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { - int32_t lane_id = threadIdx.x % kCudaWarpSize; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t mask_val = mask[i / kCudaWarpSize]; - bool is_positive = mask_val & (1 << lane_id); - addend_diff[i] = static_cast(is_positive) * dy[i]; - } -} - -template -void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { - ReluBackwardGpu<<As()->cuda_stream()>>>(n, mask, dy, addend_diff); -} - -template<> -void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const float16* dy, - float16* addend_diff) { - ReluBackward(stream, n, mask, reinterpret_cast(dy), - reinterpret_cast(addend_diff)); -} - -template -class NormalizationTrainKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - NormalizationTrainKernel() = default; - ~NormalizationTrainKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) 
const override { - if (ctx->op_type_name() == "normalization") { CHECK(ctx->Attr("training")); } - const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); - auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); - - const auto axis = ctx->Attr("axis"); - const auto epsilon = ctx->Attr("epsilon"); - const auto momentum = ctx->Attr("momentum"); - - const DataType data_type = x->data_type(); - CHECK_EQ(x->shape_view(), y->shape_view()); - CHECK_EQ(y->data_type(), data_type); - CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape_view().NumAxes()); - const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, - HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); - - const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); - auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - desc_helper.CheckParamTensor(gamma); - desc_helper.CheckParamTensor(beta); - desc_helper.CheckParamTensor(mean); - desc_helper.CheckParamTensor(inv_variance); - - user_op::Tensor* moving_mean = nullptr; - user_op::Tensor* moving_variance = nullptr; - if (ctx->has_input("moving_mean", 0)) { - CHECK(ctx->has_input("moving_variance", 0)); - moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); - moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); - desc_helper.CheckParamTensor(moving_mean); - desc_helper.CheckParamTensor(moving_variance); - } - - const void* sp_alpha = CudnnSPOnePtr(); - const void* sp_beta; - if (ctx->has_input("_add_to_output", 0)) { - const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); - CHECK_EQ(add_to_output->data_type(), y->data_type()); - CHECK_EQ(add_to_output->shape_view(), y->shape_view()); - Memcpy( - ctx->stream(), y->mut_dptr(), add_to_output->dptr(), - add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); - sp_beta = CudnnSPOnePtr(); - } else { - sp_beta = CudnnSPZeroPtr(); - } - - OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardTraining( - ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, - const_cast(sp_alpha), const_cast(sp_beta), desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), - desc_helper.param_desc(), const_cast(gamma->dptr()), const_cast(beta->dptr()), 1.0 - momentum, - moving_mean ? moving_mean->mut_dptr() : NULL, - moving_variance ? 
moving_variance->mut_dptr() : NULL, epsilon, mean->mut_dptr(), - inv_variance->mut_dptr())); - - if (ctx->op_type_name() == "normalization_add_relu") { - CHECK(!ctx->has_input("_add_to_output", 0)); - const int64_t elem_cnt = x->shape_view().elem_cnt(); - auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); - if (ctx->has_input("addend", 0)) { - const auto* addend = ctx->Tensor4ArgNameAndIndex("addend", 0); - AddRelu(ctx->stream(), elem_cnt, y->dptr(), addend->dptr(), y->mut_dptr(), - mask->mut_dptr()); - } else { - Relu(ctx->stream(), elem_cnt, y->dptr(), y->mut_dptr(), mask->mut_dptr()); - } - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_BN_TRAIN_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value) \ - && (user_op::HobAttr("training") == true)) \ - .SetInferTmpSizeFn(InferTrainTmpSize) \ - .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - if (ctx.has_input("_add_to_output", 0)) { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ - } \ - return Maybe::Ok(); \ - }); - -REGISTER_BN_TRAIN_KERNEL(float16) -REGISTER_BN_TRAIN_KERNEL(float) -REGISTER_BN_TRAIN_KERNEL(double) - -#define REGISTER_BN_ADD_RELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization_add_relu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferTrainTmpSize); - -REGISTER_BN_ADD_RELU_KERNEL(float16) -REGISTER_BN_ADD_RELU_KERNEL(float) -REGISTER_BN_ADD_RELU_KERNEL(double) - -template -class NormalizationGradUserKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - NormalizationGradUserKernel() = default; - ~NormalizationGradUserKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); - auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); - auto* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0); - auto* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0); - const auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); - const auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); - auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const auto axis = ctx->Attr("axis"); - const auto epsilon = ctx->Attr("epsilon"); - - const DataType data_type = x->data_type(); - CHECK_EQ(dy->shape_view(), x->shape_view()); - CHECK_EQ(dy->data_type(), data_type); - CHECK_EQ(dx->shape_view(), x->shape_view()); - CHECK_EQ(dx->data_type(), data_type); - CHECK_GE(axis, 0); - CHECK_LT(axis, x->shape_view().NumAxes()); - - const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, - HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); - desc_helper.CheckParamTensor(gamma); - desc_helper.CheckParamTensor(gamma_diff); - desc_helper.CheckParamTensor(beta_diff); - desc_helper.CheckParamTensor(mean); - desc_helper.CheckParamTensor(inv_variance); - - void* bn_workspace_ptr; - size_t bn_workspace_size; - const void* bn_dy_ptr; - - if (ctx->op_type_name() == 
"normalization_grad") { - bn_workspace_ptr = tmp_buffer->mut_dptr(); - bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); - bn_dy_ptr = dy->dptr(); - } else if (ctx->op_type_name() == "normalization_add_relu_grad") { - const int64_t elem_cnt = dy->shape_view().elem_cnt(); - const auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - if (ctx->has_output("addend_diff", 0)) { - user_op::Tensor* addend_diff = ctx->Tensor4ArgNameAndIndex("addend_diff", 0); - ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), - addend_diff->mut_dptr()); - bn_workspace_ptr = tmp_buffer->mut_dptr(); - bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); - bn_dy_ptr = addend_diff->dptr(); - } else { - const size_t tmp_buffer_size = tmp_buffer->shape_view().elem_cnt(); - const size_t relu_dx_size = - GetCudaAlignedSize(dy->shape_view().elem_cnt() * GetSizeOfDataType(dy->data_type())); - CHECK_GE(tmp_buffer_size, relu_dx_size); - ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), - reinterpret_cast(tmp_buffer->mut_dptr())); - bn_workspace_ptr = tmp_buffer->mut_dptr() + relu_dx_size; - bn_workspace_size = tmp_buffer_size - relu_dx_size; - bn_dy_ptr = tmp_buffer->dptr(); - } - } else { - UNIMPLEMENTED(); - } - - OF_CUDNN_CHECK(hipdnnBatchNormalizationBackward( - ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, - CudnnSPOnePtr(), CudnnSPZeroPtr(), CudnnSPOnePtr(), CudnnSPZeroPtr(), - desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), bn_dy_ptr, desc_helper.xy_desc(), - dx->mut_dptr(), desc_helper.param_desc(), gamma->dptr(), gamma_diff->mut_dptr(), - beta_diff->mut_dptr(), epsilon, mean->dptr(), inv_variance->dptr())); - - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_BN_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferGradTmpSize); - -REGISTER_BN_GRAD_KERNEL(float16) -REGISTER_BN_GRAD_KERNEL(float) -REGISTER_BN_GRAD_KERNEL(double) - -#define REGISTER_BN_ADD_RELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("normalization_add_relu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn(InferGradTmpSize); - -REGISTER_BN_ADD_RELU_GRAD_KERNEL(float16) -REGISTER_BN_ADD_RELU_GRAD_KERNEL(float) -REGISTER_BN_ADD_RELU_GRAD_KERNEL(double) - - -} // namespace -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM + +#include +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cudnn_util.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "hip/hsa_detail/device_functions.h" + +namespace oneflow { + +namespace { + +void InferDimSizeAndDataFormat(const ShapeView& x_shape, const int32_t axis, int32_t* n, int32_t* c, + int32_t* h, int32_t* w, hipdnnTensorFormat_t* format) { + if (x_shape.Count(axis + 1) == 1) { + if (axis == 0) { + *n = 1; + *h = 1; + } else { + *n = x_shape.At(0); + *h = x_shape.Count(1, axis); + } + *w = 1; + *c = x_shape.At(axis); + // *format = HIPDNN_TENSOR_NHWC; + *format = HIPDNN_TENSOR_NCHW; + // std::cout << "don't surpport HIPDNN_TENSOR_NHWC, use HIPDNN_TENSOR_NCHW instead, maybe cause wrong results" << std::endl; + } else { + *n = x_shape.Count(0, axis); + *c = x_shape.At(axis); + *h = x_shape.Count(axis + 1); + *w = 1; + *format = HIPDNN_TENSOR_NCHW; + } +} + +void InferXYCudnnTensorDesc(const ShapeView& xy_shape, const DataType& data_type, + const int32_t axis, hipdnnTensorDescriptor_t xy_desc) { + int32_t n, c, h, w; + hipdnnTensorFormat_t format; + InferDimSizeAndDataFormat(xy_shape, axis, &n, &c, &h, &w, &format); + OF_CUDNN_CHECK( + hipdnnSetTensor4dDescriptor(xy_desc, format, GetCudnnDataType(data_type), n, c, h, w)); +} + +void InferParamCudnnTensorDesc(const hipdnnTensorDescriptor_t xy_desc, hipdnnBatchNormMode_t mode, + hipdnnTensorDescriptor_t param_desc) { + OF_CUDNN_CHECK(hipdnnDeriveBNTensorDescriptor(param_desc, xy_desc, mode)); +} + +class CudnnTensorDescHelper final { + public: + OF_DISALLOW_COPY_AND_MOVE(CudnnTensorDescHelper); + CudnnTensorDescHelper(const ShapeView& xy_shape, const DataType& data_type, const int32_t axis, + hipdnnBatchNormMode_t mode) { + OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(&xy_desc_)); + InferXYCudnnTensorDesc(xy_shape, data_type, axis, xy_desc_); + OF_CUDNN_CHECK(hipdnnCreateTensorDescriptor(¶m_desc_)); + InferParamCudnnTensorDesc(xy_desc_, mode, param_desc_); + int n, c, h, w, n_stride, c_stride, h_stride, w_stride; + OF_CUDNN_CHECK(hipdnnGetTensor4dDescriptor(param_desc_, ¶m_data_type_, &n, &c, &h, &w, + &n_stride, &c_stride, &h_stride, &w_stride)); + param_size_ = c; + } + ~CudnnTensorDescHelper() { + OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(param_desc_)); + OF_CUDNN_CHECK(hipdnnDestroyTensorDescriptor(xy_desc_)); + } + + hipdnnTensorDescriptor_t xy_desc() const { return xy_desc_; } + + hipdnnTensorDescriptor_t param_desc() const { return param_desc_; } + + void CheckParamTensor(const user_op::Tensor* tensor) const { + CHECK_NOTNULL(tensor); + CHECK_EQ(tensor->shape_view().NumAxes(), 1); + CHECK_EQ(tensor->shape_view().At(0), param_size_); + // CHECK_EQ(GetCudnnDataType(tensor->data_type()), param_data_type_); + } + + private: + hipdnnTensorDescriptor_t xy_desc_ = nullptr; + hipdnnTensorDescriptor_t param_desc_ = nullptr; + hipdnnDataType_t param_data_type_; + int32_t param_size_ = 0; +}; + +size_t InferTrainWorkspaceSize(const ShapeView& x_shape, const DataType data_type, + const int32_t axis) { + return 1; +} + +size_t InferTrainTmpSize(user_op::InferContext* ctx) { + const auto& x = ctx->InputTensorDesc("x", 0); + const auto axis = ctx->Attr("axis"); + return InferTrainWorkspaceSize(x.shape(), x.data_type(), axis); +} + +size_t InferGradWorkspaceSize(const ShapeView& x_shape, const DataType data_type, + const int32_t axis) { + return 
1; +} + +size_t InferGradTmpSize(user_op::InferContext* ctx) { + const auto& dy = ctx->InputTensorDesc("dy", 0); + const auto axis = ctx->Attr("axis"); + size_t tmp_size = 0; + if (ctx->op_type_name() == "normalization_add_relu_grad" && !ctx->has_output("addend_diff", 0)) { + tmp_size += GetCudaAlignedSize(dy.shape().elem_cnt() * GetSizeOfDataType(dy.data_type())); + } + tmp_size += GetCudaAlignedSize(InferGradWorkspaceSize(dy.shape(), dy.data_type(), axis)); + return tmp_size; +} + +template +class NormalizationInferenceKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + NormalizationInferenceKernel() = default; + ~NormalizationInferenceKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const bool training = ctx->Attr("training"); + CHECK(!training); + const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); + auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); + auto* moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); + auto* moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); + const auto axis = ctx->Attr("axis"); + const auto epsilon = ctx->Attr("epsilon"); + + const DataType data_type = x->data_type(); + CHECK_EQ(x->shape_view(), y->shape_view()); + CHECK_EQ(y->data_type(), data_type); + CHECK_GE(axis, 0); + CHECK_LT(axis, x->shape_view().NumAxes()); + + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, + HIPDNN_BATCHNORM_SPATIAL); + desc_helper.CheckParamTensor(gamma); + desc_helper.CheckParamTensor(beta); + desc_helper.CheckParamTensor(moving_mean); + desc_helper.CheckParamTensor(moving_variance); + + const void* sp_alpha = CudnnSPOnePtr(); + const void* sp_beta; + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + CHECK_EQ(add_to_output->data_type(), y->data_type()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); + Memcpy( + ctx->stream(), y->mut_dptr(), add_to_output->dptr(), + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + sp_beta = CudnnSPOnePtr(); + } else { + sp_beta = CudnnSPZeroPtr(); + } + + OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardInference( + ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL, sp_alpha, + sp_beta, desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), + desc_helper.param_desc(), gamma->dptr(), beta->dptr(), moving_mean->dptr(), + moving_variance->dptr(), epsilon)); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_BN_INFERENCE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value) \ + && (user_op::HobAttr("training") == false)) \ + .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + if (ctx.has_input("_add_to_output", 0)) { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ + } \ + return Maybe::Ok(); \ + }); + +REGISTER_BN_INFERENCE_KERNEL(float16) +REGISTER_BN_INFERENCE_KERNEL(float) +REGISTER_BN_INFERENCE_KERNEL(double) + +#undef REGISTER_BN_INFERENCE_KERNEL + +constexpr int64_t 
kCudaWarpSize = 64; + +template +__global__ void ReluGpu(int64_t n, const T* x, T* y, int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + CUDA_1D_KERNEL_LOOP(i, n) { + const T x_val = x[i]; + const bool is_positive = (x_val > 0); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? x_val : 0; + } +} + +template<> +__global__ void ReluGpu(int64_t n, const half* x, half* y, int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + const half zero = __float2half(0.0f); + CUDA_1D_KERNEL_LOOP(i, n) { + const half x_val = x[i]; + const bool is_positive = __hgt(x_val, zero); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? x_val : zero; + } +} + +template +__global__ void AddReluGpu(int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + CUDA_1D_KERNEL_LOOP(i, n) { + const T sum = x[i] + addend[i]; + const bool is_positive = (sum > 0); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? sum : 0; + } +} + +template<> +__global__ void AddReluGpu(int64_t n, const half* x, const half* addend, half* y, + int64_t* mask) { + const int32_t lane_id = threadIdx.x % kCudaWarpSize; + const half zero = __float2half(0.0f); + CUDA_1D_KERNEL_LOOP(i, n) { + const half sum = __hadd(x[i], addend[i]); + const bool is_positive = __hgt(sum, zero); + int64_t warp_mask = __ballot(static_cast(is_positive)); + if (lane_id == 0) { mask[i / kCudaWarpSize] = warp_mask; } + y[i] = is_positive ? sum : zero; + } +} + +template +void Relu(ep::Stream* stream, int64_t n, const T* x, T* y, int64_t* mask) { + ReluGpu<<As()->cuda_stream()>>>(n, x, y, mask); +} + +template<> +void Relu(ep::Stream* stream, int64_t n, const float16* x, float16* y, int64_t* mask) { + Relu(stream, n, reinterpret_cast(x), reinterpret_cast(y), mask); +} + +template +void AddRelu(ep::Stream* stream, int64_t n, const T* x, const T* addend, T* y, int64_t* mask) { + AddReluGpu<<As()->cuda_stream()>>>(n, x, addend, y, mask); +} + +template<> +void AddRelu(ep::Stream* stream, int64_t n, const float16* x, const float16* addend, + float16* y, int64_t* mask) { + AddRelu(stream, n, reinterpret_cast(x), reinterpret_cast(addend), + reinterpret_cast(y), mask); +} + +template +__global__ void ReluBackwardGpu(int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { + int32_t lane_id = threadIdx.x % kCudaWarpSize; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t mask_val = mask[i / kCudaWarpSize]; + bool is_positive = mask_val & (1 << lane_id); + addend_diff[i] = static_cast(is_positive) * dy[i]; + } +} + +template +void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const T* dy, T* addend_diff) { + ReluBackwardGpu<<As()->cuda_stream()>>>(n, mask, dy, addend_diff); +} + +template<> +void ReluBackward(ep::Stream* stream, int64_t n, const int64_t* mask, const float16* dy, + float16* addend_diff) { + ReluBackward(stream, n, mask, reinterpret_cast(dy), + reinterpret_cast(addend_diff)); +} + +template +class NormalizationTrainKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + NormalizationTrainKernel() = default; + ~NormalizationTrainKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) 
const override { + if (ctx->op_type_name() == "normalization") { CHECK(ctx->Attr("training")); } + const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); + auto* y = ctx->Tensor4ArgNameAndIndex("y", 0); + + const auto axis = ctx->Attr("axis"); + const auto epsilon = ctx->Attr("epsilon"); + const auto momentum = ctx->Attr("momentum"); + + const DataType data_type = x->data_type(); + CHECK_EQ(x->shape_view(), y->shape_view()); + CHECK_EQ(y->data_type(), data_type); + CHECK_GE(axis, 0); + CHECK_LT(axis, x->shape_view().NumAxes()); + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, + HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); + + const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + const auto* beta = ctx->Tensor4ArgNameAndIndex("beta", 0); + auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); + auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); + desc_helper.CheckParamTensor(gamma); + desc_helper.CheckParamTensor(beta); + desc_helper.CheckParamTensor(mean); + desc_helper.CheckParamTensor(inv_variance); + + user_op::Tensor* moving_mean = nullptr; + user_op::Tensor* moving_variance = nullptr; + if (ctx->has_input("moving_mean", 0)) { + CHECK(ctx->has_input("moving_variance", 0)); + moving_mean = ctx->Tensor4ArgNameAndIndex("moving_mean", 0); + moving_variance = ctx->Tensor4ArgNameAndIndex("moving_variance", 0); + desc_helper.CheckParamTensor(moving_mean); + desc_helper.CheckParamTensor(moving_variance); + } + + const void* sp_alpha = CudnnSPOnePtr(); + const void* sp_beta; + if (ctx->has_input("_add_to_output", 0)) { + const user_op::Tensor* add_to_output = ctx->Tensor4ArgNameAndIndex("_add_to_output", 0); + CHECK_EQ(add_to_output->data_type(), y->data_type()); + CHECK_EQ(add_to_output->shape_view(), y->shape_view()); + Memcpy( + ctx->stream(), y->mut_dptr(), add_to_output->dptr(), + add_to_output->shape_view().elem_cnt() * GetSizeOfDataType(add_to_output->data_type())); + sp_beta = CudnnSPOnePtr(); + } else { + sp_beta = CudnnSPZeroPtr(); + } + + OF_CUDNN_CHECK(hipdnnBatchNormalizationForwardTraining( + ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, + const_cast(sp_alpha), const_cast(sp_beta), desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), y->mut_dptr(), + desc_helper.param_desc(), const_cast(gamma->dptr()), const_cast(beta->dptr()), 1.0 - momentum, + moving_mean ? moving_mean->mut_dptr() : NULL, + moving_variance ? 
moving_variance->mut_dptr() : NULL, epsilon, mean->mut_dptr(), + inv_variance->mut_dptr())); + + if (ctx->op_type_name() == "normalization_add_relu") { + CHECK(!ctx->has_input("_add_to_output", 0)); + const int64_t elem_cnt = x->shape_view().elem_cnt(); + auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); + if (ctx->has_input("addend", 0)) { + const auto* addend = ctx->Tensor4ArgNameAndIndex("addend", 0); + AddRelu(ctx->stream(), elem_cnt, y->dptr(), addend->dptr(), y->mut_dptr(), + mask->mut_dptr()); + } else { + Relu(ctx->stream(), elem_cnt, y->dptr(), y->mut_dptr(), mask->mut_dptr()); + } + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_BN_TRAIN_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value) \ + && (user_op::HobAttr("training") == true)) \ + .SetInferTmpSizeFn(InferTrainTmpSize) \ + .SetInplaceProposalFn([](const user_op::InferContext& ctx, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + if (ctx.has_input("_add_to_output", 0)) { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("y", 0, "_add_to_output", 0, true)); \ + } \ + return Maybe::Ok(); \ + }); + +REGISTER_BN_TRAIN_KERNEL(float16) +REGISTER_BN_TRAIN_KERNEL(float) +REGISTER_BN_TRAIN_KERNEL(double) + +#define REGISTER_BN_ADD_RELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization_add_relu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferTrainTmpSize); + +REGISTER_BN_ADD_RELU_KERNEL(float16) +REGISTER_BN_ADD_RELU_KERNEL(float) +REGISTER_BN_ADD_RELU_KERNEL(double) + +template +class NormalizationGradUserKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + NormalizationGradUserKernel() = default; + ~NormalizationGradUserKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const auto* x = ctx->Tensor4ArgNameAndIndex("x", 0); + auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + const auto* gamma = ctx->Tensor4ArgNameAndIndex("gamma", 0); + auto* gamma_diff = ctx->Tensor4ArgNameAndIndex("gamma_diff", 0); + auto* beta_diff = ctx->Tensor4ArgNameAndIndex("beta_diff", 0); + const auto* mean = ctx->Tensor4ArgNameAndIndex("mean", 0); + const auto* inv_variance = ctx->Tensor4ArgNameAndIndex("inv_variance", 0); + auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const auto axis = ctx->Attr("axis"); + const auto epsilon = ctx->Attr("epsilon"); + + const DataType data_type = x->data_type(); + CHECK_EQ(dy->shape_view(), x->shape_view()); + CHECK_EQ(dy->data_type(), data_type); + CHECK_EQ(dx->shape_view(), x->shape_view()); + CHECK_EQ(dx->data_type(), data_type); + CHECK_GE(axis, 0); + CHECK_LT(axis, x->shape_view().NumAxes()); + + const CudnnTensorDescHelper desc_helper(x->shape_view(), data_type, axis, + HIPDNN_BATCHNORM_SPATIAL_PERSISTENT); + desc_helper.CheckParamTensor(gamma); + desc_helper.CheckParamTensor(gamma_diff); + desc_helper.CheckParamTensor(beta_diff); + desc_helper.CheckParamTensor(mean); + desc_helper.CheckParamTensor(inv_variance); + + void* bn_workspace_ptr; + size_t bn_workspace_size; + const void* bn_dy_ptr; + + if (ctx->op_type_name() == 
"normalization_grad") { + bn_workspace_ptr = tmp_buffer->mut_dptr(); + bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); + bn_dy_ptr = dy->dptr(); + } else if (ctx->op_type_name() == "normalization_add_relu_grad") { + const int64_t elem_cnt = dy->shape_view().elem_cnt(); + const auto* mask = ctx->Tensor4ArgNameAndIndex("reserve_space", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + if (ctx->has_output("addend_diff", 0)) { + user_op::Tensor* addend_diff = ctx->Tensor4ArgNameAndIndex("addend_diff", 0); + ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), + addend_diff->mut_dptr()); + bn_workspace_ptr = tmp_buffer->mut_dptr(); + bn_workspace_size = tmp_buffer->shape_view().elem_cnt(); + bn_dy_ptr = addend_diff->dptr(); + } else { + const size_t tmp_buffer_size = tmp_buffer->shape_view().elem_cnt(); + const size_t relu_dx_size = + GetCudaAlignedSize(dy->shape_view().elem_cnt() * GetSizeOfDataType(dy->data_type())); + CHECK_GE(tmp_buffer_size, relu_dx_size); + ReluBackward(ctx->stream(), elem_cnt, mask->dptr(), dy->dptr(), + reinterpret_cast(tmp_buffer->mut_dptr())); + bn_workspace_ptr = tmp_buffer->mut_dptr() + relu_dx_size; + bn_workspace_size = tmp_buffer_size - relu_dx_size; + bn_dy_ptr = tmp_buffer->dptr(); + } + } else { + UNIMPLEMENTED(); + } + + OF_CUDNN_CHECK(hipdnnBatchNormalizationBackward( + ctx->stream()->As()->cudnn_handle(), HIPDNN_BATCHNORM_SPATIAL_PERSISTENT, + CudnnSPOnePtr(), CudnnSPZeroPtr(), CudnnSPOnePtr(), CudnnSPZeroPtr(), + desc_helper.xy_desc(), x->dptr(), desc_helper.xy_desc(), bn_dy_ptr, desc_helper.xy_desc(), + dx->mut_dptr(), desc_helper.param_desc(), gamma->dptr(), gamma_diff->mut_dptr(), + beta_diff->mut_dptr(), epsilon, mean->dptr(), inv_variance->dptr())); + + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_BN_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferGradTmpSize); + +REGISTER_BN_GRAD_KERNEL(float16) +REGISTER_BN_GRAD_KERNEL(float) +REGISTER_BN_GRAD_KERNEL(double) + +#define REGISTER_BN_ADD_RELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("normalization_add_relu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn(InferGradTmpSize); + +REGISTER_BN_ADD_RELU_GRAD_KERNEL(float16) +REGISTER_BN_ADD_RELU_GRAD_KERNEL(float) +REGISTER_BN_ADD_RELU_GRAD_KERNEL(double) + + +} // namespace +} // namespace oneflow + #endif \ No newline at end of file diff --git a/oneflow/user/kernels/nvtx_range_kernel.hip.cpp b/oneflow/user/kernels/nvtx_range_kernel.hip.cpp index 8f22f5f..24a1fa5 100644 --- a/oneflow/user/kernels/nvtx_range_kernel.hip.cpp +++ b/oneflow/user/kernels/nvtx_range_kernel.hip.cpp @@ -1,138 +1,138 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" - -#ifdef OF_ENABLE_PROFILER -#include -#endif // OF_ENABLE_PROFILER - -namespace oneflow { - -namespace { - -#ifdef OF_ENABLE_PROFILER -static thread_local HashMap mark2range_id; -#endif - -} // namespace - -class NvtxOpKernelState final : public user_op::OpKernelState { - public: - NvtxOpKernelState() : counter_(0) { -#ifndef OF_ENABLE_PROFILER - LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON"; -#endif - } - ~NvtxOpKernelState() override = default; - - int64_t counter() const { return counter_; } - void IncreaseCount() { counter_ += 1; } - - private: - int64_t counter_; -}; - -class NvtxStartKernel final : public user_op::OpKernel { - public: - NvtxStartKernel() = default; - ~NvtxStartKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared(); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape_view(); - CHECK_EQ(out->shape_view(), in_shape); - const DataType in_data_type = in->data_type(); - CHECK_EQ(out->data_type(), in_data_type); - Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); -#ifdef OF_ENABLE_PROFILER - auto* kernel_state = dynamic_cast(state); - const std::string mark_prefix = ctx->Attr("mark_prefix"); - const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); - roctx_range_id_t range_id = roctxRangeStartA(mark.c_str()); - CHECK(mark2range_id.emplace(mark, range_id).second); - kernel_state->IncreaseCount(); -#endif // OF_ENABLE_PROFILER - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("nvtx_start") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInplaceProposalFn([](const user_op::InferContext&, - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); - return Maybe::Ok(); - }); - -class NvtxEndKernel final : public user_op::OpKernel { - public: - NvtxEndKernel() = default; - ~NvtxEndKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared(); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const ShapeView& in_shape = in->shape_view(); - CHECK_EQ(out->shape_view(), in_shape); - const DataType in_data_type = in->data_type(); - CHECK_EQ(out->data_type(), in_data_type); -#ifdef OF_ENABLE_PROFILER - auto* kernel_state = dynamic_cast(state); - const std::string mark_prefix = ctx->Attr("mark_prefix"); - const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); - auto it = mark2range_id.find(mark.c_str()); - CHECK(it != mark2range_id.end()); - 
roctx_range_id_t range_id = it->second; - mark2range_id.erase(it); - roctxRangeStop(range_id); - Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); - kernel_state->IncreaseCount(); -#endif - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("nvtx_end") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInplaceProposalFn([](const user_op::InferContext&, - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); - return Maybe::Ok(); - }); - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" + +#ifdef OF_ENABLE_PROFILER +#include +#endif // OF_ENABLE_PROFILER + +namespace oneflow { + +namespace { + +#ifdef OF_ENABLE_PROFILER +static thread_local HashMap mark2range_id; +#endif + +} // namespace + +class NvtxOpKernelState final : public user_op::OpKernelState { + public: + NvtxOpKernelState() : counter_(0) { +#ifndef OF_ENABLE_PROFILER + LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON"; +#endif + } + ~NvtxOpKernelState() override = default; + + int64_t counter() const { return counter_; } + void IncreaseCount() { counter_ += 1; } + + private: + int64_t counter_; +}; + +class NvtxStartKernel final : public user_op::OpKernel { + public: + NvtxStartKernel() = default; + ~NvtxStartKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared(); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); + const DataType in_data_type = in->data_type(); + CHECK_EQ(out->data_type(), in_data_type); + Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), + in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); +#ifdef OF_ENABLE_PROFILER + auto* kernel_state = dynamic_cast(state); + const std::string mark_prefix = ctx->Attr("mark_prefix"); + const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); + roctx_range_id_t range_id = roctxRangeStartA(mark.c_str()); + CHECK(mark2range_id.emplace(mark, range_id).second); + kernel_state->IncreaseCount(); +#endif // OF_ENABLE_PROFILER + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("nvtx_start") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInplaceProposalFn([](const user_op::InferContext&, + user_op::AddInplaceArgPair 
AddInplaceArgPairFn) -> Maybe { + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); + return Maybe::Ok(); + }); + +class NvtxEndKernel final : public user_op::OpKernel { + public: + NvtxEndKernel() = default; + ~NvtxEndKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared(); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const ShapeView& in_shape = in->shape_view(); + CHECK_EQ(out->shape_view(), in_shape); + const DataType in_data_type = in->data_type(); + CHECK_EQ(out->data_type(), in_data_type); +#ifdef OF_ENABLE_PROFILER + auto* kernel_state = dynamic_cast(state); + const std::string mark_prefix = ctx->Attr("mark_prefix"); + const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter()); + auto it = mark2range_id.find(mark.c_str()); + CHECK(it != mark2range_id.end()); + roctx_range_id_t range_id = it->second; + mark2range_id.erase(it); + roctxRangeStop(range_id); + Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), + in_shape.elem_cnt() * GetSizeOfDataType(in_data_type)); + kernel_state->IncreaseCount(); +#endif + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("nvtx_end") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInplaceProposalFn([](const user_op::InferContext&, + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false)); + return Maybe::Ok(); + }); + +} // namespace oneflow diff --git a/oneflow/user/kernels/one_embedding_kernels.hip.cpp b/oneflow/user/kernels/one_embedding_kernels.hip.cpp index 59d3e50..cf0d24f 100644 --- a/oneflow/user/kernels/one_embedding_kernels.hip.cpp +++ b/oneflow/user/kernels/one_embedding_kernels.hip.cpp @@ -1,634 +1,634 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/embedding/key_value_store.h" -#include "oneflow/core/embedding/embedding_manager.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/user/kernels/random_mask_generator.h" -#include "oneflow/core/framework/random_generator_impl.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/include/primitive/copy_nd.h" -#include "oneflow/core/ep/include/primitive/cast.h" -#include "oneflow/core/ep/include/device.h" - -namespace oneflow { - -namespace { - -enum class InitializerType { kUniform, kNormal, kConstant }; - -struct EmbeddingInitializer { - InitializerType type; - union { - struct { - float low; - float high; - } uniform_param; - struct { - float mean; - float std; - } normal_param; - struct { - float value; - } constant_param; - }; - - bool operator==(const EmbeddingInitializer& rhs) const { - if (this->type != rhs.type) { return false; } - if (rhs.type == InitializerType::kUniform) { - return (this->uniform_param.low == rhs.uniform_param.low) - && (this->uniform_param.high == rhs.uniform_param.high); - } else if (rhs.type == InitializerType::kNormal) { - return (this->normal_param.mean == rhs.normal_param.mean) - && (this->normal_param.std == rhs.normal_param.std); - } else if (rhs.type == InitializerType::kConstant) { - return this->constant_param.value == rhs.constant_param.value; - } else { - UNIMPLEMENTED(); - return false; - } - } -}; - -void ParseInitializerFromJson(const nlohmann::json& initializer, - EmbeddingInitializer* embedding_initializer) { - CHECK(initializer.contains("type")); - CHECK(initializer["type"].is_string()); - std::string type = initializer["type"].get(); - if (type == "uniform") { - embedding_initializer->type = InitializerType::kUniform; - CHECK(initializer.contains("low")); - CHECK(initializer.contains("high")); - CHECK(initializer["low"].is_number()); - CHECK(initializer["high"].is_number()); - embedding_initializer->uniform_param.low = initializer["low"]; - embedding_initializer->uniform_param.high = initializer["high"]; - } else if (type == "normal") { - CHECK(initializer.contains("mean")); - CHECK(initializer.contains("std")); - CHECK(initializer["mean"].is_number()); - CHECK(initializer["std"].is_number()); - embedding_initializer->type = InitializerType::kNormal; - embedding_initializer->normal_param.mean = initializer["mean"]; - embedding_initializer->normal_param.std = initializer["std"]; - } else if (type == "constant") { - CHECK(initializer.contains("value")); - CHECK(initializer["value"].is_number()); - embedding_initializer->type = InitializerType::kConstant; - embedding_initializer->constant_param.value = initializer["value"]; - } else { - UNIMPLEMENTED() << "Unsupported initializer type"; - } -} - -int32_t ParseJsonToUniqueInitializerVecAndReturnOffset( - const nlohmann::json& initializer, std::vector* initializers) { - EmbeddingInitializer embedding_initializer; - ParseInitializerFromJson(initializer, &embedding_initializer); - for (int32_t i = 0; i < initializers->size(); ++i) { - if (initializers->at(i) == embedding_initializer) { return i; } - } - initializers->push_back(embedding_initializer); - return initializers->size() - 1; -} - -void SetInitializerIndex(int32_t row_id, int32_t col_start, int32_t col_end, int64_t line_size, - int8_t index, std::vector* initializer_index) { - int64_t row_offset = row_id * line_size; - for (int32_t col = col_start; col < col_end; ++col) { - initializer_index->at(row_offset 
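// How the int8_t initializer_index table built here is laid out, as a worked example
// (the sizes are illustrative, not taken from a real config): with num_tables = 2,
// column_dims = [2, 2] (embedding_size = 4) and two optimizer states (line_size = 12),
// the table has 2 * 12 slots. Row t describes table t: slots [0,2) hold the index of
// table t's first column initializer, [2,4) the second column's, [4,8) state 0's and
// [8,12) state 1's. The kernel that fills missing rows then reads the slot at
// table_id * line_size + col, which the helper below spells out.
#include <cstdint>

inline int64_t InitializerSlot(int32_t table_id, int64_t line_size, int32_t col) {
  // mirrors initializer_index[table_idx * line_size + col] in InitValueKernel
  return static_cast<int64_t>(table_id) * line_size + col;
}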
+ col) = index; - } -} - -void ParseAndSetStateInitializerIndex(const std::string& state_initializer, - const int32_t num_tables, const int64_t line_size, - const int64_t embedding_size, - std::vector* initializer_params, - std::vector* initializer_index) { - if (line_size == embedding_size) { return; } - CHECK(!state_initializer.empty()); - auto initializers = nlohmann::json::parse(state_initializer); - CHECK(initializers.is_array()); - const int num_states = line_size / embedding_size - 1; - CHECK_EQ(num_states, initializers.size()); - for (int32_t i = 0; i < num_states; ++i) { - int32_t offset = - ParseJsonToUniqueInitializerVecAndReturnOffset(initializers.at(i), initializer_params); - int32_t col_start = embedding_size + i * embedding_size; - int32_t col_end = col_start + embedding_size; - CHECK_LE(col_end, line_size); - for (int32_t j = 0; j < num_tables; ++j) { - SetInitializerIndex(j, col_start, col_end, line_size, offset, initializer_index); - } - } -} - -void ParseAndSetModelInitializerIndex(const nlohmann::json& tables, - const std::vector& column_dims, - const int32_t num_tables, const int32_t num_columns, - const int64_t line_size, const int64_t embedding_size, - std::vector* initializer_params, - std::vector* initializer_index) { - for (int32_t i = 0; i < num_tables; ++i) { - auto table = tables.at(i); - CHECK(table.contains("columns")); - auto columns = table["columns"]; - CHECK(columns.is_array()); - CHECK_EQ(num_columns, columns.size()) << "columns size must equal to num embedding dims"; - int32_t col_start = 0; - for (int k = 0; k < columns.size(); ++k) { - auto column = columns.at(k); - CHECK(column.contains("initializer")); - int32_t offset = - ParseJsonToUniqueInitializerVecAndReturnOffset(column["initializer"], initializer_params); - int32_t col_end = col_start + column_dims.at(k); - SetInitializerIndex(i, col_start, col_end, line_size, offset, initializer_index); - col_start = col_end; - } - CHECK_EQ(col_start, embedding_size); - } -} - -void ParseInitializers(const int64_t line_size, const int64_t embedding_size, - const std::string& state_initializer, const std::string& json_serialized, - std::vector* initializer_params, - std::vector* initializer_index) { - auto json_object = nlohmann::json::parse(json_serialized); - CHECK(json_object.contains("column_dims")); - std::vector column_dims = json_object["column_dims"]; - const int32_t num_columns = column_dims.size(); - CHECK(json_object.contains("tables")); - auto tables = json_object["tables"]; - CHECK(tables.is_array()); - const int32_t num_tables = tables.size(); - initializer_index->resize(num_tables * line_size); - ParseAndSetStateInitializerIndex(state_initializer, num_tables, line_size, embedding_size, - initializer_params, initializer_index); - ParseAndSetModelInitializerIndex(tables, column_dims, num_tables, num_columns, line_size, - embedding_size, initializer_params, initializer_index); -} - -template -class EmbeddingKernelState final : public user_op::OpKernelState { - public: - explicit EmbeddingKernelState(user_op::KernelInitContext* ctx) - : device_index_(-1), generator_(CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA))) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); - key_value_store_ = Singleton::Get()->GetKeyValueStore( - ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); - uint32_t max_query_length = - ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); - 
key_value_store_->ReserveQueryLength(max_query_length); - - const int64_t embedding_size = ctx->Attr("embedding_size"); - const int64_t line_size = ctx->Attr("line_size"); - const std::string& state_initializer = ctx->Attr("state_initializer"); - - std::vector initializer_param; - std::vector initializer_index; - ParseInitializers(line_size, embedding_size, state_initializer, - ctx->Attr("embedding_tables"), &initializer_param, - &initializer_index); - - const size_t param_size_bytes = initializer_param.size() * sizeof(EmbeddingInitializer); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_param_), param_size_bytes)); - std::memcpy(host_initializer_param_, initializer_param.data(), param_size_bytes); - OF_CUDA_CHECK(hipMalloc(&device_initializer_param_, param_size_bytes)); - OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_param_, host_initializer_param_, - param_size_bytes, hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - - const size_t index_size_bytes = initializer_index.size() * sizeof(int8_t); - OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_index_), index_size_bytes)); - std::memcpy(host_initializer_index_, initializer_index.data(), index_size_bytes); - OF_CUDA_CHECK(hipMalloc(&device_initializer_index_, index_size_bytes)); - OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_index_, host_initializer_index_, - index_size_bytes, hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - } - ~EmbeddingKernelState() override { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(host_num_keys_)); - OF_CUDA_CHECK(hipHostFree(host_initializer_param_)); - OF_CUDA_CHECK(hipFree(device_initializer_param_)); - OF_CUDA_CHECK(hipHostFree(host_initializer_index_)); - OF_CUDA_CHECK(hipFree(device_initializer_index_)); - } - - void* HostNumKeys() { return host_num_keys_; } - - embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } - - one::Generator* generator() { return generator_.get(); } - - const int8_t* InitializerIndex() { return device_initializer_index_; } - const EmbeddingInitializer* Initializers() { return device_initializer_param_; } - - private: - int device_index_; - void* host_num_keys_; - std::shared_ptr generator_; - embedding::KeyValueStore* key_value_store_; - - EmbeddingInitializer* host_initializer_param_; - EmbeddingInitializer* device_initializer_param_; - int8_t* host_initializer_index_; - int8_t* device_initializer_index_; -}; - -template -class EmbeddingPutKernelState final : public user_op::OpKernelState { - public: - explicit EmbeddingPutKernelState(user_op::KernelInitContext* ctx) : device_index_(-1) { - OF_CUDA_CHECK(hipGetDevice(&device_index_)); - OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); - key_value_store_ = Singleton::Get()->GetKeyValueStore( - ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); - uint32_t max_query_length = - ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); - key_value_store_->ReserveQueryLength(max_query_length); - } - ~EmbeddingPutKernelState() override { - CudaCurrentDeviceGuard guard(device_index_); - OF_CUDA_CHECK(hipHostFree(host_num_keys_)); - } - - void* HostNumKeys() { return host_num_keys_; } - embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } - - private: - int device_index_; - void* host_num_keys_; - embedding::KeyValueStore* key_value_store_; -}; - -enum class EmbeddingBufferType { kNumMissing = 0, kMissingIndices, kValues, kMaxType }; - -class EmbeddingTmpBufferManager final { - 
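// Sketch of the workspace layout this manager computes: one scratch allocation is
// carved into aligned sub-buffers for the missing-key counter, the missing-index list
// and (optionally) the looked-up values. The 512-byte alignment below is an assumption
// standing in for GetCudaAlignedSize(); the struct and function names are illustrative,
// but the order of the slices mirrors the AllocBuffer calls in the constructor.
#include <cstddef>
#include <cstdint>

struct EmbeddingWorkspaceLayout {
  size_t num_missing_offset = 0;
  size_t missing_indices_offset = 0;
  size_t values_offset = 0;
  size_t total_bytes = 0;
};

inline size_t AlignUp512(size_t n) { return (n + 511) / 512 * 512; }

inline EmbeddingWorkspaceLayout MakeEmbeddingWorkspaceLayout(int64_t num_ids,
                                                             int64_t value_byte_size,
                                                             bool need_value_buffer) {
  EmbeddingWorkspaceLayout layout;
  size_t offset = 0;
  layout.num_missing_offset = offset;
  offset += AlignUp512(sizeof(uint32_t));                 // kNumMissing
  layout.missing_indices_offset = offset;
  offset += AlignUp512(num_ids * sizeof(uint32_t));       // kMissingIndices
  if (need_value_buffer) {
    layout.values_offset = offset;
    offset += AlignUp512(num_ids * value_byte_size);      // kValues
  }
  layout.total_bytes = offset;  // what TotalBufferSize() reports to SetInferTmpSizeFn
  return layout;
}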
public: - OF_DISALLOW_COPY_AND_MOVE(EmbeddingTmpBufferManager); - EmbeddingTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t value_byte_size, - const bool need_value_buffer) - : offset_(0), offsets_(static_cast(EmbeddingBufferType::kMaxType), -1), ptr_(ptr) { - AllocBuffer(EmbeddingBufferType::kNumMissing, sizeof(uint32_t)); - AllocBuffer(EmbeddingBufferType::kMissingIndices, num_ids * sizeof(uint32_t)); - if (need_value_buffer) { AllocBuffer(EmbeddingBufferType::kValues, num_ids * value_byte_size); } - } - - template - T* Ptr(EmbeddingBufferType type) { - CHECK(ptr_ != nullptr); - int64_t offset = offsets_.at(static_cast(type)); - CHECK_NE(offset, -1); - return reinterpret_cast(reinterpret_cast(ptr_) + offset); - } - - size_t TotalBufferSize() const { return offset_; } - - private: - void AllocBuffer(EmbeddingBufferType type, size_t size) { - const size_t type_id = static_cast(type); - CHECK_EQ(offsets_.at(type_id), -1); - offsets_.at(type_id) = offset_; - offset_ += GetCudaAlignedSize(size); - } - - size_t offset_; - std::vector offsets_; - void* ptr_; -}; - -template -__global__ void InitValueKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, - uint64_t inc_offset, const int32_t line_size, - const int32_t embedding_size, - const EmbeddingInitializer* initializer_param, - const int8_t* initializer_index, const U* table_ids, - const uint32_t* num_missing_keys, const uint32_t* missing_indices, - T* values) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); - int64_t n = *num_missing_keys * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int row = i / line_size; - int col = i - row * line_size; - const uint32_t index = missing_indices[row]; - const int64_t offset = index * line_size + col; - const int32_t table_idx = table_ids[index]; - const int32_t initializer_idx = initializer_index[table_idx * line_size + col]; - EmbeddingInitializer initializer = initializer_param[initializer_idx]; - T value; - if (initializer.type == InitializerType::kUniform) { - const float low = initializer.uniform_param.low; - const float high = initializer.uniform_param.high; - value = hiprand_uniform(&state) * (high - low) + low; - } else if (initializer.type == InitializerType::kNormal) { - const float mean = initializer.normal_param.mean; - const float std = initializer.normal_param.std; - value = hiprand_normal(&state) * std + mean; - } else if (initializer.type == InitializerType::kConstant) { - value = initializer.constant_param.value; - } else { - asm volatile("s_trap 0;"); - } - values[offset] = value; - } - __syncthreads(); - if (threadIdx.x == 0) { - int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; - if (new_counter == gridDim.x) { - cuda_gen_state->dev_counter = 0; // reset counter to zero - cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset - } - } -} - -template -void LookupAndInitMissing(ep::Stream* stream, EmbeddingKernelState* embedding_state, - const int64_t num_ids, const int64_t embedding_size, - const int64_t line_size, const void* num_unique_ptr, - const void* unique_ids, const void* table_ids, T* values_ptr, - void* tmp_buffer_ptr, uint32_t* return_num_unique, - const bool put_to_kv_store) { - const auto& generator = embedding_state->generator(); - CHECK_NOTNULL(generator); - std::shared_ptr cuda_generator = - CHECK_JUST(generator->template 
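// Note on the generator bookkeeping at the end of InitValueKernel: every block's
// thread 0 atomically bumps cuda_gen_state->dev_counter, and the last block to finish
// resets the counter and advances dev_offset by inc_offset, so the next launch seeds
// hiprand_init with a fresh Philox subsequence offset instead of reusing random
// numbers. A sketch of one way to bound how many outputs each thread consumes (the
// launch code below computes a similar per-thread bound; the helper name is
// illustrative):
#include <cstdint>

inline uint64_t RandOutputsPerThreadUpperBound(int64_t elem_cnt, int64_t num_blocks,
                                               int64_t threads_per_block) {
  const int64_t total_threads = num_blocks * threads_per_block;
  return static_cast<uint64_t>((elem_cnt + total_threads - 1) / total_threads);  // ceil
}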
Get(stream->device()->device_index())); - uint64_t seed = cuda_generator->current_seed(); - one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); - embedding::KeyValueStore* store = embedding_state->KeyValueStore(); - const EmbeddingInitializer* initializer_param = embedding_state->Initializers(); - const int8_t* initializer_index = embedding_state->InitializerIndex(); - bool need_value_buffer = (values_ptr == nullptr); - EmbeddingTmpBufferManager buffer_manager(tmp_buffer_ptr, num_ids, line_size * sizeof(T), - need_value_buffer); - void* host_num_keys = embedding_state->HostNumKeys(); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ptr, sizeof(IDX), hipMemcpyDefault, - stream->As()->cuda_stream())); - CHECK_JUST(stream->Sync()); - uint32_t num_unique = *reinterpret_cast(host_num_keys); - uint32_t* num_missing_ptr = - buffer_manager.template Ptr(EmbeddingBufferType::kNumMissing); - uint32_t* missing_indices = - buffer_manager.template Ptr(EmbeddingBufferType::kMissingIndices); - T* store_values = - need_value_buffer ? buffer_manager.template Ptr(EmbeddingBufferType::kValues) : values_ptr; - store->Get(stream, num_unique, unique_ids, store_values, num_missing_ptr, missing_indices); - CHECK_GE(sizeof(IDX), sizeof(uint32_t)); // host_num_keys's buffer size is sizeof(IDX) - OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_missing_ptr, sizeof(uint32_t), hipMemcpyDefault, - stream->As()->cuda_stream())); - CHECK_JUST(stream->Sync()); - uint32_t num_missing = *reinterpret_cast(host_num_keys); - // init missing values - if (num_missing > 0) { - const int64_t elem_cnt = num_missing * line_size; - const int64_t num_blocks = BlocksNum4ThreadsNum(elem_cnt); - const uint64_t inc_offset = std::ceil(elem_cnt / num_blocks / kCudaThreadsNumPerBlock); - InitValueKernel - <<As()->cuda_stream()>>>( - seed, cuda_gen_state, inc_offset, line_size, embedding_size, initializer_param, - initializer_index, reinterpret_cast(table_ids), num_missing_ptr, - missing_indices, store_values); - } - if (put_to_kv_store) { store->Put(stream, num_unique, unique_ids, store_values); } - *return_num_unique = num_unique; -} - -template -__global__ void Copy2D(int64_t out_elem_cnt, const int32_t in_cols, const int32_t out_cols, - const T* in, U* out) { - CUDA_1D_KERNEL_LOOP(i, out_elem_cnt) { - const int32_t row = i / out_cols; - const int32_t col = i - row * out_cols; - const int64_t in_offset = row * in_cols + col; - out[i] = static_cast(in[in_offset]); - } -} - -template -void CopyValuesToEmbeddings(ep::Stream* stream, int64_t num_unique, const int32_t embedding_size, - const int32_t value_size, const DataType value_dtype, - const DataType embedding_dtype, const T* values, void* embeddings) { - bool need_cast = (value_dtype != embedding_dtype); - bool need_copy_nd = (embedding_size != value_size); - CHECK(need_cast || need_copy_nd); - if (need_cast && !need_copy_nd) { - const int64_t cast_elem_count = num_unique * embedding_size; - std::unique_ptr cast_primitive = - ep::primitive::NewPrimitive(DeviceType::kCUDA, value_dtype, - embedding_dtype); - cast_primitive->Launch(stream, values, embeddings, cast_elem_count); - } else if (!need_cast && need_copy_nd) { - const int32_t ndims = 2; - DimVector src_pos_vec(ndims, 0); - DimVector dst_pos_vec(ndims, 0); - DimVector src_shape = {num_unique, value_size}; - DimVector dst_shape = {num_unique, embedding_size}; - DimVector extent_shape = {num_unique, embedding_size}; - std::unique_ptr copy_nd_primitive = - ep::primitive::NewPrimitive(DeviceType::kCUDA, 
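// CopyValuesToEmbeddings picks one of three paths: dtype differs but widths match ->
// Cast primitive; widths differ but dtypes match -> CopyNd primitive copying the
// leading embedding_size columns of each value row; both differ -> the fused Copy2D
// kernel defined just above (currently wired up for float -> half only). A CPU
// reference of what Copy2D computes, for illustration:
#include <cstdint>

template<typename T, typename U>
void Copy2DReference(int64_t rows, int32_t in_cols, int32_t out_cols, const T* in, U* out) {
  // take the first out_cols entries of every in_cols-wide input row and cast them
  for (int64_t r = 0; r < rows; ++r) {
    for (int32_t c = 0; c < out_cols; ++c) {
      out[r * out_cols + c] = static_cast<U>(in[r * in_cols + c]);
    }
  }
}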
ndims); - CHECK(copy_nd_primitive); - copy_nd_primitive->Launch(stream, value_dtype, ndims, embeddings, dst_shape.data(), - dst_pos_vec.data(), values, src_shape.data(), src_pos_vec.data(), - extent_shape.data()); - } else { - const int64_t embedding_elem_cnt = num_unique * embedding_size; - if (embedding_dtype == DataType::kFloat16) { - Copy2D<<As()->cuda_stream()>>>( - embedding_elem_cnt, value_size, embedding_size, values, - reinterpret_cast(embeddings)); - } else { - UNIMPLEMENTED(); - } - } -} - -} // namespace - -template -class EmbeddingPrefetchKernel final : public user_op::OpKernel { - public: - EmbeddingPrefetchKernel() = default; - ~EmbeddingPrefetchKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* embedding_state = dynamic_cast*>(state); - CHECK(embedding_state != nullptr); - - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t embedding_size = ctx->Attr("embedding_size"); - const int64_t line_size = ctx->Attr("line_size"); - uint32_t num_unique; - T* values_ptr = nullptr; - LookupAndInitMissing(ctx->stream(), embedding_state, - unique_ids->shape_view().elem_cnt(), embedding_size, line_size, - num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), - values_ptr, tmp_buffer->mut_dptr(), &num_unique, true); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define EMBEDDING_DATA_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) - -#define TABLE_ID_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ - OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) - -#define IDX_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_prefetch") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ - EmbeddingTmpBufferManager buffer_manager( \ - nullptr, unique_ids.shape().elem_cnt(), \ - ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), true); \ - return buffer_manager.TotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL, EMBEDDING_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class EmbeddingLookupKernel final : public user_op::OpKernel { - public: - EmbeddingLookupKernel() = default; - 
~EmbeddingLookupKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* embedding_state = dynamic_cast*>(state); - CHECK(embedding_state != nullptr); - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); - user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int64_t embedding_size = ctx->Attr("embedding_size"); - const int64_t line_size = ctx->Attr("line_size"); - uint32_t num_unique; - LookupAndInitMissing( - ctx->stream(), embedding_state, unique_ids->shape_view().elem_cnt(), embedding_size, - line_size, num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), - unique_values->mut_dptr(), tmp_buffer->mut_dptr(), &num_unique, false); - if (ctx->has_output("embeddings", 0)) { - user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); - CopyValuesToEmbeddings(ctx->stream(), num_unique, embedding_size, line_size, - unique_values->data_type(), embeddings->data_type(), - unique_values->dptr(), embeddings->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("embedding_lookup") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ - && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ - EmbeddingTmpBufferManager buffer_manager( \ - nullptr, unique_ids.shape().elem_cnt(), \ - ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), false); \ - return buffer_manager.TotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL, EMBEDDING_DATA_TYPE_SEQ, - TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class EmbeddingPutKernel final : public user_op::OpKernel { - public: - EmbeddingPutKernel() = default; - ~EmbeddingPutKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - return std::make_shared>(ctx); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - auto* embedding_state = dynamic_cast*>(state); - CHECK(embedding_state != nullptr); - embedding::KeyValueStore* store = embedding_state->KeyValueStore(); - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - 
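// Note on the host round-trip that follows: the number of unique ids lives in device
// memory, so it is first copied into the pinned host_num_keys buffer and the stream is
// synchronized before store->Put is called with a host-side count. The pinned buffer
// is allocated once in EmbeddingPutKernelState (hipMallocHost), so each iteration only
// pays for one small async copy plus the sync.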
- IDX* host_num_keys = reinterpret_cast(embedding_state->HostNumKeys()); - OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ids->dptr(), sizeof(IDX), - hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - CHECK_JUST(ctx->stream()->Sync()); - - store->Put(ctx->stream(), *host_num_keys, unique_ids->dptr(), unique_embeddings->dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_EMBEDDING_PUT_KERNEL(dtype, typeproto) \ - REGISTER_USER_KERNEL("embedding_put") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == typeproto)); - -OF_PP_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PUT_KERNEL, IDX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/embedding/key_value_store.h" +#include "oneflow/core/embedding/embedding_manager.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/user/kernels/random_mask_generator.h" +#include "oneflow/core/framework/random_generator_impl.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/include/primitive/copy_nd.h" +#include "oneflow/core/ep/include/primitive/cast.h" +#include "oneflow/core/ep/include/device.h" + +namespace oneflow { + +namespace { + +enum class InitializerType { kUniform, kNormal, kConstant }; + +struct EmbeddingInitializer { + InitializerType type; + union { + struct { + float low; + float high; + } uniform_param; + struct { + float mean; + float std; + } normal_param; + struct { + float value; + } constant_param; + }; + + bool operator==(const EmbeddingInitializer& rhs) const { + if (this->type != rhs.type) { return false; } + if (rhs.type == InitializerType::kUniform) { + return (this->uniform_param.low == rhs.uniform_param.low) + && (this->uniform_param.high == rhs.uniform_param.high); + } else if (rhs.type == InitializerType::kNormal) { + return (this->normal_param.mean == rhs.normal_param.mean) + && (this->normal_param.std == rhs.normal_param.std); + } else if (rhs.type == InitializerType::kConstant) { + return this->constant_param.value == rhs.constant_param.value; + } else { + UNIMPLEMENTED(); + return false; + } + } +}; + +void ParseInitializerFromJson(const nlohmann::json& initializer, + EmbeddingInitializer* embedding_initializer) { + CHECK(initializer.contains("type")); + CHECK(initializer["type"].is_string()); + std::string type = initializer["type"].get(); + if (type == "uniform") { + embedding_initializer->type = InitializerType::kUniform; + CHECK(initializer.contains("low")); + CHECK(initializer.contains("high")); + CHECK(initializer["low"].is_number()); + CHECK(initializer["high"].is_number()); + embedding_initializer->uniform_param.low = initializer["low"]; + embedding_initializer->uniform_param.high = initializer["high"]; + } else if (type == "normal") { + 
CHECK(initializer.contains("mean")); + CHECK(initializer.contains("std")); + CHECK(initializer["mean"].is_number()); + CHECK(initializer["std"].is_number()); + embedding_initializer->type = InitializerType::kNormal; + embedding_initializer->normal_param.mean = initializer["mean"]; + embedding_initializer->normal_param.std = initializer["std"]; + } else if (type == "constant") { + CHECK(initializer.contains("value")); + CHECK(initializer["value"].is_number()); + embedding_initializer->type = InitializerType::kConstant; + embedding_initializer->constant_param.value = initializer["value"]; + } else { + UNIMPLEMENTED() << "Unsupported initializer type"; + } +} + +int32_t ParseJsonToUniqueInitializerVecAndReturnOffset( + const nlohmann::json& initializer, std::vector* initializers) { + EmbeddingInitializer embedding_initializer; + ParseInitializerFromJson(initializer, &embedding_initializer); + for (int32_t i = 0; i < initializers->size(); ++i) { + if (initializers->at(i) == embedding_initializer) { return i; } + } + initializers->push_back(embedding_initializer); + return initializers->size() - 1; +} + +void SetInitializerIndex(int32_t row_id, int32_t col_start, int32_t col_end, int64_t line_size, + int8_t index, std::vector* initializer_index) { + int64_t row_offset = row_id * line_size; + for (int32_t col = col_start; col < col_end; ++col) { + initializer_index->at(row_offset + col) = index; + } +} + +void ParseAndSetStateInitializerIndex(const std::string& state_initializer, + const int32_t num_tables, const int64_t line_size, + const int64_t embedding_size, + std::vector* initializer_params, + std::vector* initializer_index) { + if (line_size == embedding_size) { return; } + CHECK(!state_initializer.empty()); + auto initializers = nlohmann::json::parse(state_initializer); + CHECK(initializers.is_array()); + const int num_states = line_size / embedding_size - 1; + CHECK_EQ(num_states, initializers.size()); + for (int32_t i = 0; i < num_states; ++i) { + int32_t offset = + ParseJsonToUniqueInitializerVecAndReturnOffset(initializers.at(i), initializer_params); + int32_t col_start = embedding_size + i * embedding_size; + int32_t col_end = col_start + embedding_size; + CHECK_LE(col_end, line_size); + for (int32_t j = 0; j < num_tables; ++j) { + SetInitializerIndex(j, col_start, col_end, line_size, offset, initializer_index); + } + } +} + +void ParseAndSetModelInitializerIndex(const nlohmann::json& tables, + const std::vector& column_dims, + const int32_t num_tables, const int32_t num_columns, + const int64_t line_size, const int64_t embedding_size, + std::vector* initializer_params, + std::vector* initializer_index) { + for (int32_t i = 0; i < num_tables; ++i) { + auto table = tables.at(i); + CHECK(table.contains("columns")); + auto columns = table["columns"]; + CHECK(columns.is_array()); + CHECK_EQ(num_columns, columns.size()) << "columns size must equal to num embedding dims"; + int32_t col_start = 0; + for (int k = 0; k < columns.size(); ++k) { + auto column = columns.at(k); + CHECK(column.contains("initializer")); + int32_t offset = + ParseJsonToUniqueInitializerVecAndReturnOffset(column["initializer"], initializer_params); + int32_t col_end = col_start + column_dims.at(k); + SetInitializerIndex(i, col_start, col_end, line_size, offset, initializer_index); + col_start = col_end; + } + CHECK_EQ(col_start, embedding_size); + } +} + +void ParseInitializers(const int64_t line_size, const int64_t embedding_size, + const std::string& state_initializer, const std::string& json_serialized, + 
std::vector* initializer_params, + std::vector* initializer_index) { + auto json_object = nlohmann::json::parse(json_serialized); + CHECK(json_object.contains("column_dims")); + std::vector column_dims = json_object["column_dims"]; + const int32_t num_columns = column_dims.size(); + CHECK(json_object.contains("tables")); + auto tables = json_object["tables"]; + CHECK(tables.is_array()); + const int32_t num_tables = tables.size(); + initializer_index->resize(num_tables * line_size); + ParseAndSetStateInitializerIndex(state_initializer, num_tables, line_size, embedding_size, + initializer_params, initializer_index); + ParseAndSetModelInitializerIndex(tables, column_dims, num_tables, num_columns, line_size, + embedding_size, initializer_params, initializer_index); +} + +template +class EmbeddingKernelState final : public user_op::OpKernelState { + public: + explicit EmbeddingKernelState(user_op::KernelInitContext* ctx) + : device_index_(-1), generator_(CHECK_JUST(one::MakeGenerator(DeviceType::kCUDA))) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); + key_value_store_ = Singleton::Get()->GetKeyValueStore( + ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); + uint32_t max_query_length = + ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); + key_value_store_->ReserveQueryLength(max_query_length); + + const int64_t embedding_size = ctx->Attr("embedding_size"); + const int64_t line_size = ctx->Attr("line_size"); + const std::string& state_initializer = ctx->Attr("state_initializer"); + + std::vector initializer_param; + std::vector initializer_index; + ParseInitializers(line_size, embedding_size, state_initializer, + ctx->Attr("embedding_tables"), &initializer_param, + &initializer_index); + + const size_t param_size_bytes = initializer_param.size() * sizeof(EmbeddingInitializer); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_param_), param_size_bytes)); + std::memcpy(host_initializer_param_, initializer_param.data(), param_size_bytes); + OF_CUDA_CHECK(hipMalloc(&device_initializer_param_, param_size_bytes)); + OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_param_, host_initializer_param_, + param_size_bytes, hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + + const size_t index_size_bytes = initializer_index.size() * sizeof(int8_t); + OF_CUDA_CHECK(hipMallocHost(reinterpret_cast(&host_initializer_index_), index_size_bytes)); + std::memcpy(host_initializer_index_, initializer_index.data(), index_size_bytes); + OF_CUDA_CHECK(hipMalloc(&device_initializer_index_, index_size_bytes)); + OF_CUDA_CHECK(hipMemcpyAsync(device_initializer_index_, host_initializer_index_, + index_size_bytes, hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + } + ~EmbeddingKernelState() override { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipHostFree(host_num_keys_)); + OF_CUDA_CHECK(hipHostFree(host_initializer_param_)); + OF_CUDA_CHECK(hipFree(device_initializer_param_)); + OF_CUDA_CHECK(hipHostFree(host_initializer_index_)); + OF_CUDA_CHECK(hipFree(device_initializer_index_)); + } + + void* HostNumKeys() { return host_num_keys_; } + + embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } + + one::Generator* generator() { return generator_.get(); } + + const int8_t* InitializerIndex() { return device_initializer_index_; } + const EmbeddingInitializer* Initializers() { return device_initializer_param_; } + + private: + int device_index_; + void* 
host_num_keys_; + std::shared_ptr generator_; + embedding::KeyValueStore* key_value_store_; + + EmbeddingInitializer* host_initializer_param_; + EmbeddingInitializer* device_initializer_param_; + int8_t* host_initializer_index_; + int8_t* device_initializer_index_; +}; + +template +class EmbeddingPutKernelState final : public user_op::OpKernelState { + public: + explicit EmbeddingPutKernelState(user_op::KernelInitContext* ctx) : device_index_(-1) { + OF_CUDA_CHECK(hipGetDevice(&device_index_)); + OF_CUDA_CHECK(hipMallocHost(&host_num_keys_, sizeof(IDX))); + key_value_store_ = Singleton::Get()->GetKeyValueStore( + ctx->Attr("embedding_name"), ctx->parallel_ctx().parallel_id()); + uint32_t max_query_length = + ctx->TensorDesc4ArgNameAndIndex("unique_ids", 0)->shape().elem_cnt(); + key_value_store_->ReserveQueryLength(max_query_length); + } + ~EmbeddingPutKernelState() override { + CudaCurrentDeviceGuard guard(device_index_); + OF_CUDA_CHECK(hipHostFree(host_num_keys_)); + } + + void* HostNumKeys() { return host_num_keys_; } + embedding::KeyValueStore* KeyValueStore() { return key_value_store_; } + + private: + int device_index_; + void* host_num_keys_; + embedding::KeyValueStore* key_value_store_; +}; + +enum class EmbeddingBufferType { kNumMissing = 0, kMissingIndices, kValues, kMaxType }; + +class EmbeddingTmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(EmbeddingTmpBufferManager); + EmbeddingTmpBufferManager(void* ptr, const int64_t num_ids, const int64_t value_byte_size, + const bool need_value_buffer) + : offset_(0), offsets_(static_cast(EmbeddingBufferType::kMaxType), -1), ptr_(ptr) { + AllocBuffer(EmbeddingBufferType::kNumMissing, sizeof(uint32_t)); + AllocBuffer(EmbeddingBufferType::kMissingIndices, num_ids * sizeof(uint32_t)); + if (need_value_buffer) { AllocBuffer(EmbeddingBufferType::kValues, num_ids * value_byte_size); } + } + + template + T* Ptr(EmbeddingBufferType type) { + CHECK(ptr_ != nullptr); + int64_t offset = offsets_.at(static_cast(type)); + CHECK_NE(offset, -1); + return reinterpret_cast(reinterpret_cast(ptr_) + offset); + } + + size_t TotalBufferSize() const { return offset_; } + + private: + void AllocBuffer(EmbeddingBufferType type, size_t size) { + const size_t type_id = static_cast(type); + CHECK_EQ(offsets_.at(type_id), -1); + offsets_.at(type_id) = offset_; + offset_ += GetCudaAlignedSize(size); + } + + size_t offset_; + std::vector offsets_; + void* ptr_; +}; + +template +__global__ void InitValueKernel(uint64_t seed, one::CUDAGeneratorState* cuda_gen_state, + uint64_t inc_offset, const int32_t line_size, + const int32_t embedding_size, + const EmbeddingInitializer* initializer_param, + const int8_t* initializer_index, const U* table_ids, + const uint32_t* num_missing_keys, const uint32_t* missing_indices, + T* values) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, global_thread_id, cuda_gen_state->dev_offset, &state); + int64_t n = *num_missing_keys * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int row = i / line_size; + int col = i - row * line_size; + const uint32_t index = missing_indices[row]; + const int64_t offset = index * line_size + col; + const int32_t table_idx = table_ids[index]; + const int32_t initializer_idx = initializer_index[table_idx * line_size + col]; + EmbeddingInitializer initializer = initializer_param[initializer_idx]; + T value; + if (initializer.type == InitializerType::kUniform) { + const float low = initializer.uniform_param.low; + const 
float high = initializer.uniform_param.high; + value = hiprand_uniform(&state) * (high - low) + low; + } else if (initializer.type == InitializerType::kNormal) { + const float mean = initializer.normal_param.mean; + const float std = initializer.normal_param.std; + value = hiprand_normal(&state) * std + mean; + } else if (initializer.type == InitializerType::kConstant) { + value = initializer.constant_param.value; + } else { + asm volatile("s_trap 0;"); + } + values[offset] = value; + } + __syncthreads(); + if (threadIdx.x == 0) { + int32_t new_counter = cuda::atomic::Add(&cuda_gen_state->dev_counter, 1) + 1; + if (new_counter == gridDim.x) { + cuda_gen_state->dev_counter = 0; // reset counter to zero + cuda_gen_state->dev_offset += inc_offset; // maintain the state of generator's dev_offset + } + } +} + +template +void LookupAndInitMissing(ep::Stream* stream, EmbeddingKernelState* embedding_state, + const int64_t num_ids, const int64_t embedding_size, + const int64_t line_size, const void* num_unique_ptr, + const void* unique_ids, const void* table_ids, T* values_ptr, + void* tmp_buffer_ptr, uint32_t* return_num_unique, + const bool put_to_kv_store) { + const auto& generator = embedding_state->generator(); + CHECK_NOTNULL(generator); + std::shared_ptr cuda_generator = + CHECK_JUST(generator->template Get(stream->device()->device_index())); + uint64_t seed = cuda_generator->current_seed(); + one::CUDAGeneratorState* cuda_gen_state = cuda_generator->cuda_gen_state(); + embedding::KeyValueStore* store = embedding_state->KeyValueStore(); + const EmbeddingInitializer* initializer_param = embedding_state->Initializers(); + const int8_t* initializer_index = embedding_state->InitializerIndex(); + bool need_value_buffer = (values_ptr == nullptr); + EmbeddingTmpBufferManager buffer_manager(tmp_buffer_ptr, num_ids, line_size * sizeof(T), + need_value_buffer); + void* host_num_keys = embedding_state->HostNumKeys(); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ptr, sizeof(IDX), hipMemcpyDefault, + stream->As()->cuda_stream())); + CHECK_JUST(stream->Sync()); + uint32_t num_unique = *reinterpret_cast(host_num_keys); + uint32_t* num_missing_ptr = + buffer_manager.template Ptr(EmbeddingBufferType::kNumMissing); + uint32_t* missing_indices = + buffer_manager.template Ptr(EmbeddingBufferType::kMissingIndices); + T* store_values = + need_value_buffer ? 
buffer_manager.template Ptr(EmbeddingBufferType::kValues) : values_ptr; + store->Get(stream, num_unique, unique_ids, store_values, num_missing_ptr, missing_indices); + CHECK_GE(sizeof(IDX), sizeof(uint32_t)); // host_num_keys's buffer size is sizeof(IDX) + OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_missing_ptr, sizeof(uint32_t), hipMemcpyDefault, + stream->As()->cuda_stream())); + CHECK_JUST(stream->Sync()); + uint32_t num_missing = *reinterpret_cast(host_num_keys); + // init missing values + if (num_missing > 0) { + const int64_t elem_cnt = num_missing * line_size; + const int64_t num_blocks = BlocksNum4ThreadsNum(elem_cnt); + const uint64_t inc_offset = std::ceil(elem_cnt / num_blocks / kCudaThreadsNumPerBlock); + InitValueKernel + <<As()->cuda_stream()>>>( + seed, cuda_gen_state, inc_offset, line_size, embedding_size, initializer_param, + initializer_index, reinterpret_cast(table_ids), num_missing_ptr, + missing_indices, store_values); + } + if (put_to_kv_store) { store->Put(stream, num_unique, unique_ids, store_values); } + *return_num_unique = num_unique; +} + +template +__global__ void Copy2D(int64_t out_elem_cnt, const int32_t in_cols, const int32_t out_cols, + const T* in, U* out) { + CUDA_1D_KERNEL_LOOP(i, out_elem_cnt) { + const int32_t row = i / out_cols; + const int32_t col = i - row * out_cols; + const int64_t in_offset = row * in_cols + col; + out[i] = static_cast(in[in_offset]); + } +} + +template +void CopyValuesToEmbeddings(ep::Stream* stream, int64_t num_unique, const int32_t embedding_size, + const int32_t value_size, const DataType value_dtype, + const DataType embedding_dtype, const T* values, void* embeddings) { + bool need_cast = (value_dtype != embedding_dtype); + bool need_copy_nd = (embedding_size != value_size); + CHECK(need_cast || need_copy_nd); + if (need_cast && !need_copy_nd) { + const int64_t cast_elem_count = num_unique * embedding_size; + std::unique_ptr cast_primitive = + ep::primitive::NewPrimitive(DeviceType::kCUDA, value_dtype, + embedding_dtype); + cast_primitive->Launch(stream, values, embeddings, cast_elem_count); + } else if (!need_cast && need_copy_nd) { + const int32_t ndims = 2; + DimVector src_pos_vec(ndims, 0); + DimVector dst_pos_vec(ndims, 0); + DimVector src_shape = {num_unique, value_size}; + DimVector dst_shape = {num_unique, embedding_size}; + DimVector extent_shape = {num_unique, embedding_size}; + std::unique_ptr copy_nd_primitive = + ep::primitive::NewPrimitive(DeviceType::kCUDA, ndims); + CHECK(copy_nd_primitive); + copy_nd_primitive->Launch(stream, value_dtype, ndims, embeddings, dst_shape.data(), + dst_pos_vec.data(), values, src_shape.data(), src_pos_vec.data(), + extent_shape.data()); + } else { + const int64_t embedding_elem_cnt = num_unique * embedding_size; + if (embedding_dtype == DataType::kFloat16) { + Copy2D<<As()->cuda_stream()>>>( + embedding_elem_cnt, value_size, embedding_size, values, + reinterpret_cast(embeddings)); + } else { + UNIMPLEMENTED(); + } + } +} + +} // namespace + +template +class EmbeddingPrefetchKernel final : public user_op::OpKernel { + public: + EmbeddingPrefetchKernel() = default; + ~EmbeddingPrefetchKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* embedding_state = dynamic_cast*>(state); + 
CHECK(embedding_state != nullptr); + + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); + const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t embedding_size = ctx->Attr("embedding_size"); + const int64_t line_size = ctx->Attr("line_size"); + uint32_t num_unique; + T* values_ptr = nullptr; + LookupAndInitMissing(ctx->stream(), embedding_state, + unique_ids->shape_view().elem_cnt(), embedding_size, line_size, + num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), + values_ptr, tmp_buffer->mut_dptr(), &num_unique, true); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define EMBEDDING_DATA_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) + +#define TABLE_ID_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8) \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64) \ + OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) + +#define IDX_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_prefetch") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ + EmbeddingTmpBufferManager buffer_manager( \ + nullptr, unique_ids.shape().elem_cnt(), \ + ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), true); \ + return buffer_manager.TotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PREFETCH_KERNEL, EMBEDDING_DATA_TYPE_SEQ, + TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class EmbeddingLookupKernel final : public user_op::OpKernel { + public: + EmbeddingLookupKernel() = default; + ~EmbeddingLookupKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* embedding_state = dynamic_cast*>(state); + CHECK(embedding_state != nullptr); + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); + const user_op::Tensor* table_ids = ctx->Tensor4ArgNameAndIndex("table_ids", 0); + user_op::Tensor* unique_values = ctx->Tensor4ArgNameAndIndex("unique_values", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int64_t embedding_size = ctx->Attr("embedding_size"); + const int64_t line_size = ctx->Attr("line_size"); + uint32_t num_unique; + LookupAndInitMissing( + ctx->stream(), embedding_state, 
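// The two callers of LookupAndInitMissing differ only in where values land:
// embedding_prefetch passes values_ptr = nullptr, so the looked-up/initialized rows go
// to the kValues slice of tmp_buffer and are written back with store->Put
// (put_to_kv_store = true); embedding_lookup (here) writes them straight into
// unique_values, skips the Put, and optionally casts/slices them into the "embeddings"
// output via CopyValuesToEmbeddings.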
unique_ids->shape_view().elem_cnt(), embedding_size, + line_size, num_unique_ids->dptr(), unique_ids->dptr(), table_ids->dptr(), + unique_values->mut_dptr(), tmp_buffer->mut_dptr(), &num_unique, false); + if (ctx->has_output("embeddings", 0)) { + user_op::Tensor* embeddings = ctx->Tensor4ArgNameAndIndex("embeddings", 0); + CopyValuesToEmbeddings(ctx->stream(), num_unique, embedding_size, line_size, + unique_values->data_type(), embeddings->data_type(), + unique_values->dptr(), embeddings->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL(t_dtype_pair, table_dtype_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("embedding_lookup") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("unique_values", 0) == OF_PP_PAIR_SECOND(t_dtype_pair)) \ + && (user_op::HobDataType("table_ids", 0) == OF_PP_PAIR_SECOND(table_dtype_pair)) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair))) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const user_op::TensorDesc& unique_ids = ctx->InputTensorDesc("unique_ids", 0); \ + EmbeddingTmpBufferManager buffer_manager( \ + nullptr, unique_ids.shape().elem_cnt(), \ + ctx->Attr("line_size") * sizeof(OF_PP_PAIR_FIRST(t_dtype_pair)), false); \ + return buffer_manager.TotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_LOOKUP_KERNEL, EMBEDDING_DATA_TYPE_SEQ, + TABLE_ID_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class EmbeddingPutKernel final : public user_op::OpKernel { + public: + EmbeddingPutKernel() = default; + ~EmbeddingPutKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + return std::make_shared>(ctx); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + auto* embedding_state = dynamic_cast*>(state); + CHECK(embedding_state != nullptr); + embedding::KeyValueStore* store = embedding_state->KeyValueStore(); + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_ids = ctx->Tensor4ArgNameAndIndex("unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + + IDX* host_num_keys = reinterpret_cast(embedding_state->HostNumKeys()); + OF_CUDA_CHECK(hipMemcpyAsync(host_num_keys, num_unique_ids->dptr(), sizeof(IDX), + hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + CHECK_JUST(ctx->stream()->Sync()); + + store->Put(ctx->stream(), *host_num_keys, unique_ids->dptr(), unique_embeddings->dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_EMBEDDING_PUT_KERNEL(dtype, typeproto) \ + REGISTER_USER_KERNEL("embedding_put") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == typeproto)); + +OF_PP_FOR_EACH_TUPLE(REGISTER_CUDA_EMBEDDING_PUT_KERNEL, IDX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp b/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp index a134dda..db1decc 100644 --- a/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp +++ 
b/oneflow/user/kernels/one_embedding_update_kernels.hip.cpp @@ -1,604 +1,604 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/user/kernels/model_update_kernel_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void SGDUpdateKernel(const int64_t embedding_size, T scale, float l1, float l2, - float weight_decay, const IDX* num_unique_ids, - const float* learning_rate, const T* scale_by_ptr, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* model, T* updated_model) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_model[i] = model[i]; } - } else { - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - updated_model[i] = model[i]; - SGDUpdateFunctor()(model_diff + i, updated_model + i, scale, l1, l2, weight_decay, - learning_rate_val); - } - } -} - -__device__ void GetMomentumOffset(const int32_t line_size, const int32_t embedding_size, - int64_t model_diff_offset, int64_t* model_offset, - int64_t* momentum_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *momentum_offset = *model_offset + embedding_size; -} - -template -__global__ void MomentumUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, - float l1, float l2, float weight_decay, float beta, - const IDX* num_unique_ids, const float* learning_rate, - const T* scale_by_ptr, const T* down_scale_by_ptr, - const int64_t* skip_if, const G* model_diff, - const T* unique_values, T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } - } else { - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t momentum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &momentum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[momentum_offset] = unique_values[momentum_offset]; - MomentumUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + momentum_offset, scale, l1, l2, beta, - weight_decay, learning_rate_val); - } - } -} - -__device__ void GetAdamOffset(const int32_t line_size, const int32_t 
embedding_size, - int64_t model_diff_offset, int64_t* model_offset, int64_t* m_offset, - int64_t* v_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *m_offset = *model_offset + embedding_size; - *v_offset = *model_offset + 2 * embedding_size; -} - -template -__global__ void AdamUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, - float l1, float l2, float weight_decay, float beta1, float beta2, - float epsilon, const float* bias_correction1_ptr, - const float* bias_correction2_ptr, const IDX* num_unique_ids, - const float* learning_rate, const T* scale_by_ptr, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* unique_values, - T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { - // The n is the unique_values elem_cnt, so not need to use GetAdamOffset. - updated_unique_values[i] = unique_values[i]; - } - } else { - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float bias_correction1_val = 1.0; - float bias_correction2_val = 1.0; - if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } - if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - // The n is model_diff elem_cnt. - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t m_offset; - int64_t v_offset; - GetAdamOffset(line_size, embedding_size, i, &model_offset, &m_offset, &v_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[m_offset] = unique_values[m_offset]; - updated_unique_values[v_offset] = unique_values[v_offset]; - AdamUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + m_offset, updated_unique_values + v_offset, - nullptr, scale, l1, l2, beta1, beta2, epsilon, weight_decay, false, - bias_correction1_val, bias_correction2_val, learning_rate_val); - } - } -} - -template -__global__ void AdagradUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, - float l1, float l2, float weight_decay, float lr_decay, - float epsilon, const IDX* num_unique_ids, - const float* learning_rate, const int64_t* train_step_ptr, - const T* scale_by_ptr, const T* down_scale_by_ptr, - const int64_t* skip_if, const G* model_diff, - const T* unique_values, T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } - } else { - int64_t train_step = *train_step_ptr + 1; - if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t sum_offset; - GetMomentumOffset(line_size, embedding_size, i, &model_offset, &sum_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[sum_offset] = unique_values[sum_offset]; - 
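// Illustrative host-side sketch of the packed row layout these update kernels share: each
// unique id owns a row of line_size values holding the embedding followed by its optimizer
// state, so line_size is 2 * embedding_size for momentum/Adagrad and 3 * embedding_size for
// Adam/FTRL. The helper below mirrors GetMomentumOffset to show how a flat gradient index
// lands in that row; the names are hypothetical stand-ins, not part of the kernels.
#include <cassert>
#include <cstdint>

void MomentumOffsetSketch(int32_t line_size, int32_t embedding_size, int64_t grad_offset,
                          int64_t* model_offset, int64_t* momentum_offset) {
  const int32_t row = grad_offset / embedding_size;        // which unique id
  const int32_t col = grad_offset - row * embedding_size;  // position inside the embedding
  *model_offset = row * line_size + col;                   // embedding part of the packed row
  *momentum_offset = *model_offset + embedding_size;       // state slot right after it
}

int main() {
  int64_t model_offset = 0;
  int64_t momentum_offset = 0;
  // embedding_size = 4, line_size = 8: gradient element 6 is row 1, column 2.
  MomentumOffsetSketch(/*line_size=*/8, /*embedding_size=*/4, /*grad_offset=*/6,
                       &model_offset, &momentum_offset);
  assert(model_offset == 10);     // 1 * 8 + 2
  assert(momentum_offset == 14);  // 10 + 4
  return 0;
}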
AdagradUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + sum_offset, scale, l1, l2, epsilon, - weight_decay, learning_rate_val); - } - } -} - -__device__ void GetFtrlOffset(const int32_t line_size, const int32_t embedding_size, - int64_t model_diff_offset, int64_t* model_offset, - int64_t* accumulate_offset, int64_t* z_offset) { - const int32_t row = model_diff_offset / embedding_size; - const int32_t col = model_diff_offset - row * embedding_size; - *model_offset = row * line_size + col; - *accumulate_offset = *model_offset + embedding_size; - *z_offset = *model_offset + 2 * embedding_size; -} - -template -__global__ void FtrlUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, - float l1, float l2, float weight_decay, float lr_power, - float lambda1, float lambda2, float beta, - const IDX* num_unique_ids, const float* learning_rate, - const T* down_scale_by_ptr, const int64_t* skip_if, - const G* model_diff, const T* unique_values, - T* updated_unique_values) { - if (skip_if != nullptr && *skip_if != 0) { - const int64_t n = *num_unique_ids * line_size; - CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } - } else { - if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } - float learning_rate_val = *learning_rate; - const int64_t n = *num_unique_ids * embedding_size; - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t model_offset; - int64_t accumulate_offset; - int64_t z_offset; - GetFtrlOffset(line_size, embedding_size, i, &model_offset, &accumulate_offset, &z_offset); - updated_unique_values[model_offset] = unique_values[model_offset]; - updated_unique_values[accumulate_offset] = unique_values[accumulate_offset]; - updated_unique_values[z_offset] = unique_values[z_offset]; - FtrlUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, - updated_unique_values + accumulate_offset, - updated_unique_values + z_offset, scale, l1, l2, lr_power, lambda1, - lambda2, beta, weight_decay, learning_rate_val); - } - } -} - -} // namespace - -template -class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - SgdEmbeddingUpdateKernel() = default; - ~SgdEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size); - const auto scale = ctx->Attr("scale"); - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - 
CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - SGDUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - embedding_size, scale, l1, l2, weight_decay, - reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, - down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), - updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define IDX_DATA_TYPE_SEQ \ - OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ - OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) - -#define REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("sgd_embedding_update") \ - .SetCreateFn< \ - SgdEmbeddingUpdateKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, - // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) -template -class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - MomentumEmbeddingUpdateKernel() = default; - ~MomentumEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 2); - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const auto beta = ctx->Attr("beta"); - const auto scale = ctx->Attr("scale"); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = 
ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - MomentumUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, scale, l1, l2, weight_decay, beta, - reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, - down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), - updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("momentum_embedding_update") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL, - // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, - IDX_DATA_TYPE_SEQ) - -template -class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - AdamEmbeddingUpdateKernel() = default; - ~AdamEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 3); - - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const auto beta1 = ctx->Attr("beta1"); - const auto beta2 = ctx->Attr("beta2"); - const auto epsilon = ctx->Attr("epsilon"); - 
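// Minimal scalar sketch of one bias-corrected Adam step, assuming AdamUpdateFunctor
// (defined in model_update_kernel_util.h, not shown in this hunk) follows the standard
// Adam formulas; the l1/l2/weight_decay/scale terms it also takes are omitted, and
// bias_correction1/2 arrive as precomputed scalars that default to 1.0f when the optional
// inputs are absent. Names below are illustrative only.
#include <cmath>
#include <cstdio>

void AdamStepSketch(float grad, float* w, float* m, float* v, float lr, float beta1,
                    float beta2, float epsilon, float bias_correction1,
                    float bias_correction2) {
  *m = beta1 * *m + (1.0f - beta1) * grad;          // first moment
  *v = beta2 * *v + (1.0f - beta2) * grad * grad;   // second moment
  const float m_hat = *m / bias_correction1;        // bias-corrected moments
  const float v_hat = *v / bias_correction2;
  *w -= lr * m_hat / (std::sqrt(v_hat) + epsilon);  // parameter update
}

int main() {
  float w = 1.0f, m = 0.0f, v = 0.0f;
  // First step (t = 1): bias_correction1 = 1 - beta1, bias_correction2 = 1 - beta2.
  AdamStepSketch(/*grad=*/0.5f, &w, &m, &v, /*lr=*/1e-3f, /*beta1=*/0.9f, /*beta2=*/0.999f,
                 /*epsilon=*/1e-8f, /*bias_correction1=*/0.1f, /*bias_correction2=*/0.001f);
  std::printf("w after one step: %f\n", w);  // roughly 0.999
  return 0;
}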
const bool do_bias_correction = ctx->Attr("do_bias_correction"); - const auto scale = ctx->Attr("scale"); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - const float* bias_correction1_ptr = nullptr; - if (ctx->has_input("bias_correction1", 0)) { - bias_correction1_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0)->dptr(); - } - const float* bias_correction2_ptr = nullptr; - if (ctx->has_input("bias_correction2", 0)) { - bias_correction2_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0)->dptr(); - } - // update kernel - AdamUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, beta1, beta2, - epsilon, bias_correction1_ptr, bias_correction2_ptr, - reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, - down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), - updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("adam_embedding_update") \ - .SetCreateFn< \ - AdamEmbeddingUpdateKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, - // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - -template -class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - AdagradEmbeddingUpdateKernel() = default; - ~AdagradEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* 
updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 2); - - const float l1 = ctx->Attr("l1"); - const float l2 = ctx->Attr("l2"); - const auto weight_decay = ctx->Attr("weight_decay"); - const auto lr_decay = ctx->Attr("lr_decay"); - const auto epsilon = ctx->Attr("epsilon"); - const auto scale = ctx->Attr("scale"); - const T* scale_by_ptr = nullptr; - if (ctx->has_input("scale_by_tensor", 0)) { - const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); - CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); - scale_by_ptr = scale_by_tensor->dptr(); - } - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* train_step_ptr = ctx->Tensor4ArgNameAndIndex("train_step", 0)->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - AdagradUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_decay, - epsilon, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, - train_step_ptr, scale_by_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), - unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("adagrad_embedding_update") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL, - // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, - FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, - IDX_DATA_TYPE_SEQ) - -template -class FtrlEmbeddingUpdateKernel final : public user_op::OpKernel { - public: - FtrlEmbeddingUpdateKernel() = default; - ~FtrlEmbeddingUpdateKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* num_unique_ids = 
ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); - const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); - const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); - user_op::Tensor* updated_unique_embeddings = - ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); - CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2) - << "The NumAxes of unique_embedding should be equal to 2. "; - CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2) - << "The NumAxes of embedding_grad should be equal to 2. "; - const int64_t num_keys = unique_embeddings->shape_view().At(0); - const int64_t line_size = unique_embeddings->shape_view().At(1); - const int64_t embedding_size = embedding_grad->shape_view().At(1); - CHECK_EQ(line_size, embedding_size * 3) - << "The line_size should be equal to 3 x embedding_size. "; - const float l1 = 0.0; - const float l2 = 0.0; - const float weight_decay = ctx->Attr("weight_decay"); - // TODO(zhengzekang): Undefined behavior for ftrl optimizer with weight_decay in `abs(new_z_val) - // < lambda1` condition. - CHECK_EQ(weight_decay, static_cast(0.0)) - << "Currently not support for setting weight decay. "; - const float lr_power = ctx->Attr("lr_power"); - const float lambda1 = ctx->Attr("lambda1"); - const float lambda2 = ctx->Attr("lambda2"); - const float beta = ctx->Attr("beta"); - const double scale = ctx->Attr("scale"); - const T* down_scale_by_ptr = nullptr; - if (ctx->has_input("down_scale_by_tensor", 0)) { - const user_op::Tensor* down_scale_by_tensor = - ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); - CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); - CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); - down_scale_by_ptr = down_scale_by_tensor->dptr(); - } - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); - const float* learning_rate_ptr = learning_rate->dptr(); - const int64_t* skip_if_ptr = nullptr; - if (ctx->has_input("skip_if", 0)) { - const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); - CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); - skip_if_ptr = skip_if->dptr(); - } - // update kernel - FtrlUpdateKernel - <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, - 0, ctx->stream()->As()->cuda_stream()>>>( - line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_power, - lambda1, lambda2, beta, reinterpret_cast(num_unique_ids->dptr()), - learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), - unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; -#define REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ - REGISTER_USER_KERNEL("ftrl_embedding_update") \ - .SetCreateFn< \ - FtrlEmbeddingUpdateKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ - && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ - && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, - // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/user/kernels/model_update_kernel_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void SGDUpdateKernel(const int64_t embedding_size, T scale, float l1, float l2, + float weight_decay, const IDX* num_unique_ids, + const float* learning_rate, const T* scale_by_ptr, + const T* down_scale_by_ptr, const int64_t* skip_if, + const G* model_diff, const T* model, T* updated_model) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_model[i] = model[i]; } + } else { + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + updated_model[i] = model[i]; + SGDUpdateFunctor()(model_diff + i, updated_model + i, scale, l1, l2, weight_decay, + learning_rate_val); + } + } +} + +__device__ void GetMomentumOffset(const int32_t line_size, const int32_t embedding_size, + int64_t model_diff_offset, int64_t* model_offset, + int64_t* momentum_offset) { + const int32_t row = model_diff_offset / embedding_size; + const int32_t col = model_diff_offset - row * embedding_size; + *model_offset = row * line_size + col; + *momentum_offset = *model_offset + embedding_size; +} + +template +__global__ void MomentumUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, + float l1, float l2, float weight_decay, float beta, + const IDX* num_unique_ids, const float* learning_rate, + const T* scale_by_ptr, const T* down_scale_by_ptr, + const int64_t* skip_if, const G* model_diff, + const T* unique_values, T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } + } else { + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t momentum_offset; + GetMomentumOffset(line_size, embedding_size, i, &model_offset, &momentum_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[momentum_offset] = unique_values[momentum_offset]; + MomentumUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + momentum_offset, scale, l1, l2, beta, + weight_decay, learning_rate_val); + } + } +} + +__device__ void GetAdamOffset(const int32_t line_size, const int32_t embedding_size, + int64_t model_diff_offset, int64_t* model_offset, int64_t* m_offset, + int64_t* v_offset) { + const int32_t row = model_diff_offset 
/ embedding_size; + const int32_t col = model_diff_offset - row * embedding_size; + *model_offset = row * line_size + col; + *m_offset = *model_offset + embedding_size; + *v_offset = *model_offset + 2 * embedding_size; +} + +template +__global__ void AdamUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, + float l1, float l2, float weight_decay, float beta1, float beta2, + float epsilon, const float* bias_correction1_ptr, + const float* bias_correction2_ptr, const IDX* num_unique_ids, + const float* learning_rate, const T* scale_by_ptr, + const T* down_scale_by_ptr, const int64_t* skip_if, + const G* model_diff, const T* unique_values, + T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { + // The n is the unique_values elem_cnt, so not need to use GetAdamOffset. + updated_unique_values[i] = unique_values[i]; + } + } else { + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float bias_correction1_val = 1.0; + float bias_correction2_val = 1.0; + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + // The n is model_diff elem_cnt. + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t m_offset; + int64_t v_offset; + GetAdamOffset(line_size, embedding_size, i, &model_offset, &m_offset, &v_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[m_offset] = unique_values[m_offset]; + updated_unique_values[v_offset] = unique_values[v_offset]; + AdamUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + m_offset, updated_unique_values + v_offset, + nullptr, scale, l1, l2, beta1, beta2, epsilon, weight_decay, false, + bias_correction1_val, bias_correction2_val, learning_rate_val); + } + } +} + +template +__global__ void AdagradUpdateKernel(const int64_t line_size, const int64_t embedding_size, T scale, + float l1, float l2, float weight_decay, float lr_decay, + float epsilon, const IDX* num_unique_ids, + const float* learning_rate, const int64_t* train_step_ptr, + const T* scale_by_ptr, const T* down_scale_by_ptr, + const int64_t* skip_if, const G* model_diff, + const T* unique_values, T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } + } else { + int64_t train_step = *train_step_ptr + 1; + if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + learning_rate_val = learning_rate_val / (1 + (train_step - 1) * lr_decay); + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t sum_offset; + GetMomentumOffset(line_size, embedding_size, i, &model_offset, &sum_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[sum_offset] = unique_values[sum_offset]; + AdagradUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + sum_offset, scale, l1, l2, epsilon, + weight_decay, 
learning_rate_val); + } + } +} + +__device__ void GetFtrlOffset(const int32_t line_size, const int32_t embedding_size, + int64_t model_diff_offset, int64_t* model_offset, + int64_t* accumulate_offset, int64_t* z_offset) { + const int32_t row = model_diff_offset / embedding_size; + const int32_t col = model_diff_offset - row * embedding_size; + *model_offset = row * line_size + col; + *accumulate_offset = *model_offset + embedding_size; + *z_offset = *model_offset + 2 * embedding_size; +} + +template +__global__ void FtrlUpdateKernel(const int32_t line_size, const int32_t embedding_size, T scale, + float l1, float l2, float weight_decay, float lr_power, + float lambda1, float lambda2, float beta, + const IDX* num_unique_ids, const float* learning_rate, + const T* down_scale_by_ptr, const int64_t* skip_if, + const G* model_diff, const T* unique_values, + T* updated_unique_values) { + if (skip_if != nullptr && *skip_if != 0) { + const int64_t n = *num_unique_ids * line_size; + CUDA_1D_KERNEL_LOOP(i, n) { updated_unique_values[i] = unique_values[i]; } + } else { + if (down_scale_by_ptr != nullptr) { scale /= *down_scale_by_ptr; } + float learning_rate_val = *learning_rate; + const int64_t n = *num_unique_ids * embedding_size; + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t model_offset; + int64_t accumulate_offset; + int64_t z_offset; + GetFtrlOffset(line_size, embedding_size, i, &model_offset, &accumulate_offset, &z_offset); + updated_unique_values[model_offset] = unique_values[model_offset]; + updated_unique_values[accumulate_offset] = unique_values[accumulate_offset]; + updated_unique_values[z_offset] = unique_values[z_offset]; + FtrlUpdateFunctor()(model_diff + i, updated_unique_values + model_offset, + updated_unique_values + accumulate_offset, + updated_unique_values + z_offset, scale, l1, l2, lr_power, lambda1, + lambda2, beta, weight_decay, learning_rate_val); + } + } +} + +} // namespace + +template +class SgdEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + SgdEmbeddingUpdateKernel() = default; + ~SgdEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size); + const auto scale = ctx->Attr("scale"); + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const T* scale_by_ptr = nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = 
scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + SGDUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + embedding_size, scale, l1, l2, weight_decay, + reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, + down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), + updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define IDX_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) + +#define REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("sgd_embedding_update") \ + .SetCreateFn< \ + SgdEmbeddingUpdateKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_SGD_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, + // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) +template +class MomentumEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + MomentumEmbeddingUpdateKernel() = default; + ~MomentumEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 2); + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const auto beta = ctx->Attr("beta"); + const auto scale = ctx->Attr("scale"); + const T* scale_by_ptr = nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + 
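// Sketch of the launch pattern the Compute methods in this file share, assuming the OneFlow
// helpers kCudaThreadsNumPerBlock, BlocksNum4ThreadsNum and CUDA_1D_KERNEL_LOOP follow the
// usual grid-stride scheme; the names below are illustrative stand-ins, not the real
// definitions.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

constexpr int kThreadsPerBlockSketch = 512;  // stand-in for kCudaThreadsNumPerBlock

inline int BlocksForSketch(int64_t n) {      // stand-in for BlocksNum4ThreadsNum
  return static_cast<int>((n + kThreadsPerBlockSketch - 1) / kThreadsPerBlockSketch);
}

__global__ void ScaleSketch(int64_t n, float scale, const float* src, float* dst) {
  // Grid-stride loop, conceptually what CUDA_1D_KERNEL_LOOP(i, n) expands to.
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    dst[i] = scale * src[i];
  }
}

int main() {
  const int64_t n = 1 << 20;
  std::vector<float> host(n, 1.0f);
  float* src = nullptr;
  float* dst = nullptr;
  hipMalloc(reinterpret_cast<void**>(&src), n * sizeof(float));
  hipMalloc(reinterpret_cast<void**>(&dst), n * sizeof(float));
  hipMemcpy(src, host.data(), n * sizeof(float), hipMemcpyHostToDevice);
  ScaleSketch<<<BlocksForSketch(n), kThreadsPerBlockSketch, 0, 0>>>(n, 2.0f, src, dst);
  hipMemcpy(host.data(), dst, n * sizeof(float), hipMemcpyDeviceToHost);
  std::printf("dst[0] = %f\n", host[0]);  // expect 2.000000
  hipFree(src);
  hipFree(dst);
  return 0;
}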
CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + MomentumUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, scale, l1, l2, weight_decay, beta, + reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, + down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), + updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("momentum_embedding_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_MOMENTUM_EMBEDDING_UPDATE_KERNEL, + // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, + FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, + IDX_DATA_TYPE_SEQ) + +template +class AdamEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + AdamEmbeddingUpdateKernel() = default; + ~AdamEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 3); + + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const auto beta1 = ctx->Attr("beta1"); + const auto beta2 = ctx->Attr("beta2"); + const auto epsilon = ctx->Attr("epsilon"); + const bool do_bias_correction = ctx->Attr("do_bias_correction"); + const auto scale = ctx->Attr("scale"); + const T* scale_by_ptr 
= nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + const float* bias_correction1_ptr = nullptr; + if (ctx->has_input("bias_correction1", 0)) { + bias_correction1_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0)->dptr(); + } + const float* bias_correction2_ptr = nullptr; + if (ctx->has_input("bias_correction2", 0)) { + bias_correction2_ptr = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0)->dptr(); + } + // update kernel + AdamUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, beta1, beta2, + epsilon, bias_correction1_ptr, bias_correction2_ptr, + reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, scale_by_ptr, + down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), unique_embeddings->dptr(), + updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("adam_embedding_update") \ + .SetCreateFn< \ + AdamEmbeddingUpdateKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAM_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, + // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + +template +class AdagradEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + AdagradEmbeddingUpdateKernel() = default; + ~AdagradEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2); + 
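// Host-side sketch of the learning-rate decay applied inside AdagradUpdateKernel above:
// train_step is the device counter plus one, and the rate decays as
// lr / (1 + (train_step - 1) * lr_decay). The helper name is illustrative only.
#include <cstdio>

float DecayedAdagradLr(float base_lr, long train_step_from_device, float lr_decay) {
  const long train_step = train_step_from_device + 1;
  return base_lr / (1.0f + (train_step - 1) * lr_decay);
}

int main() {
  // base_lr = 0.1, lr_decay = 0.5: the first step keeps 0.1, the third step gives 0.05.
  std::printf("%f %f\n", DecayedAdagradLr(0.1f, 0, 0.5f), DecayedAdagradLr(0.1f, 2, 0.5f));
  return 0;
}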
CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2); + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 2); + + const float l1 = ctx->Attr("l1"); + const float l2 = ctx->Attr("l2"); + const auto weight_decay = ctx->Attr("weight_decay"); + const auto lr_decay = ctx->Attr("lr_decay"); + const auto epsilon = ctx->Attr("epsilon"); + const auto scale = ctx->Attr("scale"); + const T* scale_by_ptr = nullptr; + if (ctx->has_input("scale_by_tensor", 0)) { + const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); + CHECK_EQ(scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(scale_by_tensor->shape_view().elem_cnt(), 1); + scale_by_ptr = scale_by_tensor->dptr(); + } + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* train_step_ptr = ctx->Tensor4ArgNameAndIndex("train_step", 0)->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + AdagradUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_decay, + epsilon, reinterpret_cast(num_unique_ids->dptr()), learning_rate_ptr, + train_step_ptr, scale_by_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), + unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("adagrad_embedding_update") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_ADAGRAD_EMBEDDING_UPDATE_KERNEL, + // FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, + FLOATING_DATA_TYPE_SEQ, FLOATING_DATA_TYPE_SEQ, + IDX_DATA_TYPE_SEQ) + +template +class FtrlEmbeddingUpdateKernel final : public user_op::OpKernel { + public: + FtrlEmbeddingUpdateKernel() = default; + ~FtrlEmbeddingUpdateKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* num_unique_ids = ctx->Tensor4ArgNameAndIndex("num_unique_ids", 0); + const user_op::Tensor* unique_embeddings = ctx->Tensor4ArgNameAndIndex("unique_embeddings", 0); + const 
user_op::Tensor* embedding_grad = ctx->Tensor4ArgNameAndIndex("embedding_grad", 0); + user_op::Tensor* updated_unique_embeddings = + ctx->Tensor4ArgNameAndIndex("updated_unique_embeddings", 0); + CHECK_EQ(unique_embeddings->shape_view().NumAxes(), 2) + << "The NumAxes of unique_embedding should be equal to 2. "; + CHECK_EQ(embedding_grad->shape_view().NumAxes(), 2) + << "The NumAxes of embedding_grad should be equal to 2. "; + const int64_t num_keys = unique_embeddings->shape_view().At(0); + const int64_t line_size = unique_embeddings->shape_view().At(1); + const int64_t embedding_size = embedding_grad->shape_view().At(1); + CHECK_EQ(line_size, embedding_size * 3) + << "The line_size should be equal to 3 x embedding_size. "; + const float l1 = 0.0; + const float l2 = 0.0; + const float weight_decay = ctx->Attr("weight_decay"); + // TODO(zhengzekang): Undefined behavior for ftrl optimizer with weight_decay in `abs(new_z_val) + // < lambda1` condition. + CHECK_EQ(weight_decay, static_cast(0.0)) + << "Currently not support for setting weight decay. "; + const float lr_power = ctx->Attr("lr_power"); + const float lambda1 = ctx->Attr("lambda1"); + const float lambda2 = ctx->Attr("lambda2"); + const float beta = ctx->Attr("beta"); + const double scale = ctx->Attr("scale"); + const T* down_scale_by_ptr = nullptr; + if (ctx->has_input("down_scale_by_tensor", 0)) { + const user_op::Tensor* down_scale_by_tensor = + ctx->Tensor4ArgNameAndIndex("down_scale_by_tensor", 0); + CHECK_EQ(down_scale_by_tensor->data_type(), unique_embeddings->data_type()); + CHECK_EQ(down_scale_by_tensor->shape_view().elem_cnt(), 1); + down_scale_by_ptr = down_scale_by_tensor->dptr(); + } + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + const float* learning_rate_ptr = learning_rate->dptr(); + const int64_t* skip_if_ptr = nullptr; + if (ctx->has_input("skip_if", 0)) { + const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); + CHECK_EQ(skip_if->shape_view().elem_cnt(), 1); + skip_if_ptr = skip_if->dptr(); + } + // update kernel + FtrlUpdateKernel + <<shape_view().elem_cnt()), kCudaThreadsNumPerBlock, + 0, ctx->stream()->As()->cuda_stream()>>>( + line_size, embedding_size, static_cast(scale), l1, l2, weight_decay, lr_power, + lambda1, lambda2, beta, reinterpret_cast(num_unique_ids->dptr()), + learning_rate_ptr, down_scale_by_ptr, skip_if_ptr, embedding_grad->dptr(), + unique_embeddings->dptr(), updated_unique_embeddings->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; +#define REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL(t_dtype_pair, g_type_pair, idx_dtype_pair) \ + REGISTER_USER_KERNEL("ftrl_embedding_update") \ + .SetCreateFn< \ + FtrlEmbeddingUpdateKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("num_unique_ids", 0) == OF_PP_PAIR_SECOND(idx_dtype_pair)) \ + && (user_op::HobDataType("embedding_grad", 0) == OF_PP_PAIR_SECOND(g_type_pair)) \ + && (user_op::HobDataType("unique_embeddings", 0) == OF_PP_PAIR_SECOND(t_dtype_pair))); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_CUDA_FTRL_EMBEDDING_UPDATE_KERNEL, FLOATING_DATA_TYPE_SEQ, + // FLOATING_DATA_TYPE_SEQ HALF_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + FLOATING_DATA_TYPE_SEQ, IDX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/one_hot_kernel.hip.cpp b/oneflow/user/kernels/one_hot_kernel.hip.cpp index 661a41b..4b0a5b1 100644 --- 
a/oneflow/user/kernels/one_hot_kernel.hip.cpp +++ b/oneflow/user/kernels/one_hot_kernel.hip.cpp @@ -1,81 +1,81 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -template -__global__ void OneHotEncodeGpu(int64_t elem_cnt, const int64_t depth, const T on_value, - const T off_value, const K* indices, T* out) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int64_t row = i / depth; - const int64_t col = i - row * depth; - const int64_t idx = indices[row]; - assert(idx >= 0 && idx < depth); - out[i] = (idx == col) ? on_value : off_value; - } -} - -} // namespace - -template -class GpuOneHotKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { - public: - GpuOneHotKernel() = default; - ~GpuOneHotKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t num_indices = indices->shape_view().elem_cnt(); - const int64_t depth = ctx->Attr("depth"); - const DataType dtype = ctx->Attr("dtype"); - const T on_value = IsFloatingDataType(dtype) - ? static_cast(ctx->Attr("floating_on_value")) - : static_cast(ctx->Attr("integer_on_value")); - const T off_value = IsFloatingDataType(dtype) - ? static_cast(ctx->Attr("floating_off_value")) - : static_cast(ctx->Attr("integer_off_value")); - RUN_CUDA_KERNEL((OneHotEncodeGpu), ctx->stream(), num_indices * depth, - num_indices * depth, depth, on_value, off_value, indices->dptr(), - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_ONE_HOT_KERNEL(dtype, itype) \ - REGISTER_USER_KERNEL("one_hot").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("indices", 0) == GetDataType::value) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)); - -REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int64_t) -REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int64_t) -REGISTER_CUDA_ONE_HOT_KERNEL(float, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(float, int64_t) -REGISTER_CUDA_ONE_HOT_KERNEL(double, int32_t) -REGISTER_CUDA_ONE_HOT_KERNEL(double, int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +template +__global__ void OneHotEncodeGpu(int64_t elem_cnt, const int64_t depth, const T on_value, + const T off_value, const K* indices, T* out) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int64_t row = i / depth; + const int64_t col = i - row * depth; + const int64_t idx = indices[row]; + assert(idx >= 0 && idx < depth); + out[i] = (idx == col) ? on_value : off_value; + } +} + +} // namespace + +template +class GpuOneHotKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport { + public: + GpuOneHotKernel() = default; + ~GpuOneHotKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* indices = ctx->Tensor4ArgNameAndIndex("indices", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t num_indices = indices->shape_view().elem_cnt(); + const int64_t depth = ctx->Attr("depth"); + const DataType dtype = ctx->Attr("dtype"); + const T on_value = IsFloatingDataType(dtype) + ? static_cast(ctx->Attr("floating_on_value")) + : static_cast(ctx->Attr("integer_on_value")); + const T off_value = IsFloatingDataType(dtype) + ? static_cast(ctx->Attr("floating_off_value")) + : static_cast(ctx->Attr("integer_off_value")); + RUN_CUDA_KERNEL((OneHotEncodeGpu), ctx->stream(), num_indices * depth, + num_indices * depth, depth, on_value, off_value, indices->dptr(), + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_ONE_HOT_KERNEL(dtype, itype) \ + REGISTER_USER_KERNEL("one_hot").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("indices", 0) == GetDataType::value) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)); + +REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(int32_t, int64_t) +REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(int64_t, int64_t) +REGISTER_CUDA_ONE_HOT_KERNEL(float, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(float, int64_t) +REGISTER_CUDA_ONE_HOT_KERNEL(double, int32_t) +REGISTER_CUDA_ONE_HOT_KERNEL(double, int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/pad2d_kernels_util.hip.cpp b/oneflow/user/kernels/pad2d_kernels_util.hip.cpp index 9c9bf9c..37f40e6 100644 --- a/oneflow/user/kernels/pad2d_kernels_util.hip.cpp +++ b/oneflow/user/kernels/pad2d_kernels_util.hip.cpp @@ -1,214 +1,214 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/pad2d_kernels_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -template -__global__ void DoCUDAReflectionPad2d(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - DoReflectionPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, - x_height, x_width, pad_left, pad_top); -}; - -template -__global__ void DoCUDAReflectionPad2dGrad(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - DoReflectionPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, - dy_width, dx_height, dx_width, pad_left, pad_top); -}; - -template -struct ReflectionPad2dFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, - pad_left, pad_top); - } -}; - -// float16 implementation -template<> -void ReflectionPad2dFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); -} - -template -struct ReflectionPad2dGradFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, - dx_width, pad_left, pad_top); - } -}; - -// float16 implementation 
-template<> -void ReflectionPad2dGradFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); -} - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_GRAD_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -template -__global__ void DoCUDAReplicationPad2d(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - DoReplicationPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, - x_height, x_width, pad_left, pad_top); -}; - -template -__global__ void DoCUDAReplicationPad2dGrad(const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper index_helper, - int64_t elem_num, int64_t src_num, int64_t dest_num, - int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - DoReplicationPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, - dy_width, dx_height, dx_width, pad_left, pad_top); -}; - -template -struct ReplicationPad2dFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, - int64_t x_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, - pad_left, pad_top); - } -}; - -// float16 implementation -template<> -void ReplicationPad2dFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * y_height * y_width; - int64_t src_num = n_channel * x_height * x_width; - int64_t elem_num = n_batch * dest_num; - DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); -} - -template -struct ReplicationPad2dGradFunctor final { - void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, - int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, - int64_t dx_width, int64_t pad_left, int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = 
n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( - src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, - dx_width, pad_left, pad_top); - } -}; - -// float16 implementation -template<> -void ReplicationPad2dGradFunctor::operator()( - ep::Stream* stream, const float16* src, float16* dest, - const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, - int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, - int64_t pad_top) { - int64_t dest_num = n_channel * dx_height * dx_width; - int64_t src_num = n_channel * dy_height * dy_width; - int64_t elem_num = n_batch * src_num; - DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( - reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, - src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); -} - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_GRAD_FUNCTOR, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - PADDING_DATA_TYPE_CUDA_SEQ); - -} // namespace user_op -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/pad2d_kernels_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +template +__global__ void DoCUDAReflectionPad2d(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + DoReflectionPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, + x_height, x_width, pad_left, pad_top); +}; + +template +__global__ void DoCUDAReflectionPad2dGrad(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + DoReflectionPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, + dy_width, dx_height, dx_width, pad_left, pad_top); +}; + +template +struct ReflectionPad2dFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, + pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReflectionPad2dFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReflectionPad2d<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); +} + +template +struct ReflectionPad2dGradFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height * dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, + dx_width, pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReflectionPad2dGradFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height 
* dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReflectionPad2dGrad<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); +} + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REFLECTION_PAD2D_GRAD_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +template +__global__ void DoCUDAReplicationPad2d(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + DoReplicationPad2d(src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, + x_height, x_width, pad_left, pad_top); +}; + +template +__global__ void DoCUDAReplicationPad2dGrad(const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper index_helper, + int64_t elem_num, int64_t src_num, int64_t dest_num, + int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + DoReplicationPad2dGrad(src, dest, index_helper, elem_num, src_num, dest_num, dy_height, + dy_width, dx_height, dx_width, pad_left, pad_top); +}; + +template +struct ReplicationPad2dFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t y_height, int64_t y_width, int64_t x_height, + int64_t x_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, y_height, y_width, x_height, x_width, + pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReplicationPad2dFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t y_height, int64_t y_width, int64_t x_height, int64_t x_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * y_height * y_width; + int64_t src_num = n_channel * x_height * x_width; + int64_t elem_num = n_batch * dest_num; + DoCUDAReplicationPad2d<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, y_height, y_width, x_height, x_width, pad_left, pad_top); +} + +template +struct ReplicationPad2dGradFunctor final { + void operator()(ep::Stream* stream, const IN_T* src, IN_T* dest, + const NdIndexOffsetHelper& index_helper, int64_t n_batch, + int64_t n_channel, int64_t dy_height, int64_t dy_width, int64_t dx_height, + int64_t dx_width, int64_t pad_left, int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height * dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( + src, dest, index_helper, elem_num, src_num, dest_num, dy_height, dy_width, dx_height, + dx_width, pad_left, pad_top); + } +}; + +// float16 implementation +template<> +void ReplicationPad2dGradFunctor::operator()( + ep::Stream* stream, const float16* src, float16* dest, + const 
NdIndexOffsetHelper& index_helper, int64_t n_batch, int64_t n_channel, + int64_t dy_height, int64_t dy_width, int64_t dx_height, int64_t dx_width, int64_t pad_left, + int64_t pad_top) { + int64_t dest_num = n_channel * dx_height * dx_width; + int64_t src_num = n_channel * dy_height * dy_width; + int64_t elem_num = n_batch * src_num; + DoCUDAReplicationPad2dGrad<<As()->cuda_stream()>>>( + reinterpret_cast(src), reinterpret_cast(dest), index_helper, elem_num, + src_num, dest_num, dy_height, dy_width, dx_height, dx_width, pad_left, pad_top); +} + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_REPLICATION_PAD2D_GRAD_FUNCTOR, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + PADDING_DATA_TYPE_CUDA_SEQ); + +} // namespace user_op +} // namespace oneflow + #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp b/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp index 22c1798..402ea80 100644 --- a/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp +++ b/oneflow/user/kernels/partial_fc_sample_kernel.hip.cpp @@ -1,431 +1,431 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/balanced_splitter.h" -#include "oneflow/user/kernels/gather_kernel_util.h" -#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" -#include -#include -#include -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -template -int64_t GetCubSortPairsTempStorageSize(int64_t n) { - size_t cub_sort_temp_store_size = 0; - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, - nullptr, nullptr, nullptr, n))); - size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); - CHECK_GE(temp_store_size, 0); - CHECK_LT(temp_store_size, static_cast(GetMaxVal())); - return static_cast(temp_store_size); -} - -template -int64_t GetCubScanTempStorageSize(int64_t n) { - size_t cub_scan_temp_store_size = 0; - NotEqualToPreviousAdjacentIterator unique_counting_iter(nullptr, 0); - OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( - nullptr, cub_scan_temp_store_size, unique_counting_iter, nullptr, n))); - size_t temp_store_size = GetCudaAlignedSize(cub_scan_temp_store_size); - CHECK_GE(temp_store_size, 0); - CHECK_LT(temp_store_size, static_cast(GetMaxVal())); - return static_cast(temp_store_size); -} - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(void* ptr, const int64_t device_num_class, const int64_t batch_size, - const int64_t parallel_num) - : ptr_(ptr) { - const int64_t buffer_elem_cnt = std::max(device_num_class, batch_size); - const size_t cub_sort_keys_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t cub_sort_values_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t cub_sort_keys_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t cub_sort_values_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); - const size_t bound_index_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); - const size_t bound_value_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); - cub_tmp_storage_bytes_ = std::max(GetCubSortPairsTempStorageSize(buffer_elem_cnt), - GetCubScanTempStorageSize(batch_size)); - cub_sort_keys_offset_ = 0; - cub_sort_values_offset_ = cub_sort_keys_offset_ + cub_sort_keys_bytes; - cub_sort_keys_out_offset_ = cub_sort_values_offset_ + cub_sort_values_bytes; - cub_sort_values_out_offset_ = cub_sort_keys_out_offset_ + cub_sort_keys_out_bytes; - cub_tmp_storage_offset_ = cub_sort_values_out_offset_ + cub_sort_values_out_bytes; - bound_index_offset_ = cub_tmp_storage_offset_ + cub_tmp_storage_bytes_; - bound_value_offset_ = bound_index_offset_ + bound_index_bytes; - total_buffer_size_ = cub_sort_keys_bytes + cub_sort_values_bytes + cub_sort_keys_out_bytes - + cub_sort_values_out_bytes + cub_tmp_storage_bytes_ + bound_index_bytes - + bound_value_bytes; - } - ~TmpBufferManager() = default; - - size_t GetTotalBufferSize() const { return total_buffer_size_; } - size_t GetCubTmpStorageSize() const { return cub_tmp_storage_bytes_; } - K* CubSortKeysPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_offset_); - } - K* CubSortValuesPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_offset_); - } - K* CubSortKeysOutPtr() const { - CHECK(ptr_ != 
nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_out_offset_); - } - K* CubSortValuesOutPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_out_offset_); - } - void* CubTmpStoragePtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + cub_tmp_storage_offset_); - } - K* BoundIndexPtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + bound_index_offset_); - } - K* BoundValuePtr() const { - CHECK(ptr_ != nullptr); - return reinterpret_cast(reinterpret_cast(ptr_) + bound_value_offset_); - } - - private: - size_t cub_sort_keys_offset_; - size_t cub_sort_values_offset_; - size_t cub_sort_keys_out_offset_; - size_t cub_sort_values_out_offset_; - size_t cub_tmp_storage_offset_; - size_t bound_index_offset_; - size_t bound_value_offset_; - size_t cub_tmp_storage_bytes_; - size_t total_buffer_size_; - void* ptr_; -}; - -__global__ void SetupKernel(int64_t seed, hiprandState* state) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) - + (static_cast(id) >> 2U)); - hiprand_init(local_seed, 0, 0, &state[id]); -} - -template -__global__ void GenerateGpu(hiprandState* state, const int64_t n, const int64_t max_val, K* buffer) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - CUDA_1D_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } - state[id] = localState; -} - -class DistributedPartialFcSampleOpKernelState final : public user_op::OpKernelState { - public: - DistributedPartialFcSampleOpKernelState(ep::Stream* stream, int64_t lower, int64_t upper, - int64_t num_sample_per_rank, int64_t seed) - : lower_(lower), upper_(upper), num_sample_per_rank_(num_sample_per_rank) { - CHECK_NOTNULL(stream); - const int64_t num_classes = upper_ - lower_; - OF_CUDA_CHECK(hipMalloc(&curand_states_, BlocksNum4ThreadsNum(num_classes) - * kCudaThreadsNumPerBlock * sizeof(hiprandState))); - SetupKernel<<As()->cuda_stream()>>>(seed, curand_states_); - } - ~DistributedPartialFcSampleOpKernelState() { - hipError_t ret = hipFree(curand_states_); - if (ret != hipErrorDeinitialized) { OF_CUDA_CHECK(ret); } - }; - - int64_t lower() const { return lower_; } - int64_t upper() const { return upper_; } - int64_t num_sample_per_rank() const { return num_sample_per_rank_; } - - template - void GenRandom(ep::Stream* stream, const int64_t n, const int64_t max_val, K* buffer) { - GenerateGpu - <<As()->cuda_stream()>>>(curand_states_, n, max_val, buffer); - } - - private: - const int64_t lower_; - const int64_t upper_; - const int64_t num_sample_per_rank_; - hiprandState* curand_states_; -}; - -template -__global__ void IotaKernel(int64_t n, K* out) { - CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(i); } -} - -template -__global__ void MarkPositive(const int64_t n, const int64_t offset, const int64_t num_classes, - const K* labels, K* out) { - CUDA_1D_KERNEL_LOOP(i, n) { - K label = labels[i] - offset; - if (label >= 0 && label < num_classes) { out[label] = label - num_classes; } - } -} - -template -__global__ void GetSampledLabel(const int64_t n, const int64_t offset, const K* label, - K* sampled_label) { - CUDA_1D_KERNEL_LOOP(i, n) { sampled_label[i] = label[i] + offset; } -} - -template -__global__ void GetLabelMap(const int64_t n, const int64_t parallel_num, - const int64_t num_sample_per_rank, const K* bound_index, - const 
K* bound_value, K* label_map) { - CUDA_1D_KERNEL_LOOP(i, n) { -#pragma unroll - for (int64_t j = 0; j < parallel_num; j++) { - if (i >= bound_index[j] && i < bound_index[j + 1]) { - label_map[i] = label_map[i] - bound_value[j] + j * num_sample_per_rank; - } - } - } -} - -template -__global__ void GetPartionBound(const int64_t n, const int64_t parallel_num, - const int64_t num_classes_per_rank, const K* key_ptr, - const K* value_ptr, K* bound_index, K* bound_value) { - CUDA_1D_KERNEL_LOOP(i, n) { - if (i != 0) { - const K cur_in = key_ptr[i] / num_classes_per_rank; - const K pre_in = key_ptr[i - 1] / num_classes_per_rank; - if (cur_in > pre_in) { - assert(cur_in < parallel_num); -#pragma unroll - for (int32_t j = pre_in + 1; j <= cur_in; ++j) { - bound_index[j] = static_cast(i); - bound_value[j] = value_ptr[i]; - } - } - } - } - CUDA_1D_KERNEL_LOOP(i, parallel_num + 1) { - const K first_in = key_ptr[0] / num_classes_per_rank; - const K last_in = key_ptr[n - 1] / num_classes_per_rank; - if (i <= first_in) { - bound_index[i] = 0; - bound_value[i] = value_ptr[0]; - } else if (i > last_in) { - bound_index[i] = n; - bound_value[i] = value_ptr[n - 1]; - } - } -} - -template -__global__ void GetMappedLabel(const int64_t n, const K* label_map_key, const K* label_map_value, - K* mapped_label) { - CUDA_1D_KERNEL_LOOP(i, n) { mapped_label[label_map_key[i]] = label_map_value[i]; } -} - -template -void MapLabel(ep::Stream* stream, const int64_t num_classes, const int64_t batch_size, - const int64_t lower_bound, const int64_t parallel_num, const int64_t num_sample, - size_t temp_storage_bytes, const K* label_ptr, K* mapped_label_ptr, - K* cub_sort_values_ptr, K* cub_sort_keys_out_ptr, K* cub_sort_values_out_ptr, - void* cub_tmp_storage_ptr, K* bound_index_ptr, K* bound_value_ptr) { - IotaKernel<<As()->cuda_stream()>>>(batch_size, cub_sort_values_ptr); - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( - cub_tmp_storage_ptr, temp_storage_bytes, label_ptr, cub_sort_keys_out_ptr, - cub_sort_values_ptr, cub_sort_values_out_ptr, batch_size, 0, sizeof(K) * 8, - stream->As()->cuda_stream()))); - NotEqualToPreviousAdjacentIterator unique_counting_iter(cub_sort_keys_out_ptr, 0); - OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( - cub_tmp_storage_ptr, temp_storage_bytes, unique_counting_iter, cub_sort_values_ptr, - batch_size, stream->As()->cuda_stream()))); - - GetPartionBound<<As()->cuda_stream()>>>( - batch_size, parallel_num, num_classes, cub_sort_keys_out_ptr, cub_sort_values_ptr, - bound_index_ptr, bound_value_ptr); - - GetLabelMap<<As()->cuda_stream()>>>( - batch_size, parallel_num, num_sample, bound_index_ptr, bound_value_ptr, cub_sort_values_ptr); - - GetMappedLabel<<As()->cuda_stream()>>>( - batch_size, cub_sort_values_out_ptr, cub_sort_values_ptr, mapped_label_ptr); -} - -} // namespace - -template -class DistributedPartialFcSampleGpuKernel final : public user_op::OpKernel { - public: - DistributedPartialFcSampleGpuKernel() = default; - ~DistributedPartialFcSampleGpuKernel() override = default; - - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("weight", 0); - const TensorDesc* in_logical_desc = ctx->LogicalTensorDesc4ArgNameAndIndex("weight", 0); - const int64_t class_num = in_logical_desc->shape().At(0); - const int64_t num_sample = ctx->Attr("num_sample"); - int64_t seed = ctx->Attr("seed"); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - const int64_t 
num_sample_per_rank = RoundUp(num_sample, parallel_num) / parallel_num; - if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 0 && parallel_num > 1) { - std::seed_seq seq{seed}; - std::vector seeds(parallel_num); - seq.generate(seeds.begin(), seeds.end()); - seed = seeds.at(ctx->parallel_ctx().parallel_id()); - CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); - BalancedSplitter bs(class_num, parallel_num); - return std::make_shared( - ctx->stream(), bs.At(ctx->parallel_ctx().parallel_id()).begin(), - bs.At(ctx->parallel_ctx().parallel_id()).end(), num_sample_per_rank, seed); - } else { - return std::make_shared(ctx->stream(), 0, class_num, - num_sample_per_rank, seed); - } - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - user_op::Tensor* mapped_label = ctx->Tensor4ArgNameAndIndex("mapped_label", 0); - user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); - user_op::Tensor* sampled_weight = ctx->Tensor4ArgNameAndIndex("sampled_weight", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - const int64_t batch_size = label->shape_view().At(0); - const int64_t num_classes = weight->shape_view().At(0); - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_classes, batch_size, - parallel_num); - - auto* kernel_state = dynamic_cast(state); - CHECK_NOTNULL(kernel_state); - CHECK_EQ(num_classes, kernel_state->upper() - kernel_state->lower()); - const int64_t lower_bound = kernel_state->lower(); - const int64_t num_sample = kernel_state->num_sample_per_rank(); - kernel_state->GenRandom(ctx->stream(), num_classes, num_classes, - buffer_manager.CubSortKeysPtr()); - MarkPositive<<stream()->As()->cuda_stream()>>>( - batch_size, lower_bound, num_classes, label->dptr(), buffer_manager.CubSortKeysPtr()); - IotaKernel<<stream()->As()->cuda_stream()>>>( - num_classes, buffer_manager.CubSortValuesPtr()); - size_t temp_storage_bytes = buffer_manager.GetCubTmpStorageSize(); - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( - buffer_manager.CubTmpStoragePtr(), temp_storage_bytes, buffer_manager.CubSortKeysPtr(), - buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesPtr(), - buffer_manager.CubSortValuesOutPtr(), num_classes, 0, sizeof(K) * 8, - ctx->stream()->As()->cuda_stream()))); - - GetSampledLabel<<stream()->As()->cuda_stream()>>>( - num_sample, lower_bound, buffer_manager.CubSortValuesOutPtr(), - sampled_label->mut_dptr()); - - GatherKernelUtilImpl::Forward( - ctx->stream(), buffer_manager.CubSortValuesOutPtr(), num_sample, weight->dptr(), - Shape({1, num_classes, weight->shape_view().Count(1)}), sampled_weight->mut_dptr(), 0); - - MapLabel(ctx->stream(), num_classes, batch_size, lower_bound, parallel_num, num_sample, - buffer_manager.GetCubTmpStorageSize(), label->dptr(), - mapped_label->mut_dptr(), buffer_manager.CubSortValuesPtr(), - buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesOutPtr(), - buffer_manager.CubTmpStoragePtr(), buffer_manager.BoundIndexPtr(), - buffer_manager.BoundValuePtr()); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define 
REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("distributed_partial_fc_sample") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ - .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ - const int64_t num_classes = ctx->InputTensorDesc("weight", 0).shape().At(0); \ - const int64_t batch_size = ctx->InputTensorDesc("label", 0).shape().At(0); \ - const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); \ - TmpBufferManager buffer_manager(nullptr, num_classes, \ - batch_size, parallel_num); \ - return buffer_manager.GetTotalBufferSize(); \ - }); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -template -class DistributedPartialFcSampleDisableBoxingGpuKernel final : public user_op::OpKernel { - public: - DistributedPartialFcSampleDisableBoxingGpuKernel() = default; - ~DistributedPartialFcSampleDisableBoxingGpuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache*) const override { - const user_op::Tensor* sampled_weight_diff = - ctx->Tensor4ArgNameAndIndex("sampled_weight_diff", 0); - const user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); - user_op::Tensor* boxing_disabled_sampled_weight_diff = - ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_weight_diff", 0); - user_op::Tensor* boxing_disabled_sampled_label = - ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_label", 0); - Memcpy(ctx->stream(), boxing_disabled_sampled_weight_diff->mut_dptr(), - sampled_weight_diff->dptr(), - sampled_weight_diff->shape_view().elem_cnt() - * GetSizeOfDataType(sampled_weight_diff->data_type())); - Memcpy( - ctx->stream(), boxing_disabled_sampled_label->mut_dptr(), sampled_label->dptr(), - sampled_label->shape_view().elem_cnt() * GetSizeOfDataType(sampled_label->data_type())); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("distributed_partial_fc_sample_disable_boxing") \ - .SetCreateFn>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("sampled_label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("sampled_weight_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair))); -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL, - FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/balanced_splitter.h" +#include "oneflow/user/kernels/gather_kernel_util.h" +#include "oneflow/core/common/not_equal_to_previous_adjacent_iterator.h" +#include +#include +#include +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +template +int64_t GetCubSortPairsTempStorageSize(int64_t n) { + size_t cub_sort_temp_store_size = 0; + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, + nullptr, nullptr, nullptr, n))); + size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); + CHECK_GE(temp_store_size, 0); + CHECK_LT(temp_store_size, static_cast(GetMaxVal())); + return static_cast(temp_store_size); +} + +template +int64_t GetCubScanTempStorageSize(int64_t n) { + size_t cub_scan_temp_store_size = 0; + NotEqualToPreviousAdjacentIterator unique_counting_iter(nullptr, 0); + OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( + nullptr, cub_scan_temp_store_size, unique_counting_iter, nullptr, n))); + size_t temp_store_size = GetCudaAlignedSize(cub_scan_temp_store_size); + CHECK_GE(temp_store_size, 0); + CHECK_LT(temp_store_size, static_cast(GetMaxVal())); + return static_cast(temp_store_size); +} + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(void* ptr, const int64_t device_num_class, const int64_t batch_size, + const int64_t parallel_num) + : ptr_(ptr) { + const int64_t buffer_elem_cnt = std::max(device_num_class, batch_size); + const size_t cub_sort_keys_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t cub_sort_values_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t cub_sort_keys_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t cub_sort_values_out_bytes = GetCudaAlignedSize(buffer_elem_cnt * sizeof(K)); + const size_t bound_index_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); + const size_t bound_value_bytes = GetCudaAlignedSize((parallel_num + 1) * sizeof(K)); + cub_tmp_storage_bytes_ = std::max(GetCubSortPairsTempStorageSize(buffer_elem_cnt), + GetCubScanTempStorageSize(batch_size)); + cub_sort_keys_offset_ = 0; + cub_sort_values_offset_ = cub_sort_keys_offset_ + cub_sort_keys_bytes; + cub_sort_keys_out_offset_ = cub_sort_values_offset_ + cub_sort_values_bytes; + cub_sort_values_out_offset_ = cub_sort_keys_out_offset_ + cub_sort_keys_out_bytes; + cub_tmp_storage_offset_ = cub_sort_values_out_offset_ + cub_sort_values_out_bytes; + bound_index_offset_ = cub_tmp_storage_offset_ + cub_tmp_storage_bytes_; + bound_value_offset_ = bound_index_offset_ + bound_index_bytes; + total_buffer_size_ = cub_sort_keys_bytes + cub_sort_values_bytes + cub_sort_keys_out_bytes + + cub_sort_values_out_bytes + cub_tmp_storage_bytes_ + bound_index_bytes + + bound_value_bytes; + } + ~TmpBufferManager() = default; + + size_t GetTotalBufferSize() const { return total_buffer_size_; } + size_t GetCubTmpStorageSize() const { return cub_tmp_storage_bytes_; } + K* CubSortKeysPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_offset_); + } + K* CubSortValuesPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_offset_); + } + K* CubSortKeysOutPtr() const { + CHECK(ptr_ != 
nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_keys_out_offset_); + } + K* CubSortValuesOutPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_sort_values_out_offset_); + } + void* CubTmpStoragePtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + cub_tmp_storage_offset_); + } + K* BoundIndexPtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + bound_index_offset_); + } + K* BoundValuePtr() const { + CHECK(ptr_ != nullptr); + return reinterpret_cast(reinterpret_cast(ptr_) + bound_value_offset_); + } + + private: + size_t cub_sort_keys_offset_; + size_t cub_sort_values_offset_; + size_t cub_sort_keys_out_offset_; + size_t cub_sort_values_out_offset_; + size_t cub_tmp_storage_offset_; + size_t bound_index_offset_; + size_t bound_value_offset_; + size_t cub_tmp_storage_bytes_; + size_t total_buffer_size_; + void* ptr_; +}; + +__global__ void SetupKernel(int64_t seed, hiprandState* state) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + size_t local_seed = (static_cast(seed) + 0x9e3779b9U + (static_cast(id) << 6U) + + (static_cast(id) >> 2U)); + hiprand_init(local_seed, 0, 0, &state[id]); +} + +template +__global__ void GenerateGpu(hiprandState* state, const int64_t n, const int64_t max_val, K* buffer) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + CUDA_1D_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } + state[id] = localState; +} + +class DistributedPartialFcSampleOpKernelState final : public user_op::OpKernelState { + public: + DistributedPartialFcSampleOpKernelState(ep::Stream* stream, int64_t lower, int64_t upper, + int64_t num_sample_per_rank, int64_t seed) + : lower_(lower), upper_(upper), num_sample_per_rank_(num_sample_per_rank) { + CHECK_NOTNULL(stream); + const int64_t num_classes = upper_ - lower_; + OF_CUDA_CHECK(hipMalloc(&curand_states_, BlocksNum4ThreadsNum(num_classes) + * kCudaThreadsNumPerBlock * sizeof(hiprandState))); + SetupKernel<<As()->cuda_stream()>>>(seed, curand_states_); + } + ~DistributedPartialFcSampleOpKernelState() { + hipError_t ret = hipFree(curand_states_); + if (ret != hipErrorDeinitialized) { OF_CUDA_CHECK(ret); } + }; + + int64_t lower() const { return lower_; } + int64_t upper() const { return upper_; } + int64_t num_sample_per_rank() const { return num_sample_per_rank_; } + + template + void GenRandom(ep::Stream* stream, const int64_t n, const int64_t max_val, K* buffer) { + GenerateGpu + <<As()->cuda_stream()>>>(curand_states_, n, max_val, buffer); + } + + private: + const int64_t lower_; + const int64_t upper_; + const int64_t num_sample_per_rank_; + hiprandState* curand_states_; +}; + +template +__global__ void IotaKernel(int64_t n, K* out) { + CUDA_1D_KERNEL_LOOP(i, n) { out[i] = static_cast(i); } +} + +template +__global__ void MarkPositive(const int64_t n, const int64_t offset, const int64_t num_classes, + const K* labels, K* out) { + CUDA_1D_KERNEL_LOOP(i, n) { + K label = labels[i] - offset; + if (label >= 0 && label < num_classes) { out[label] = label - num_classes; } + } +} + +template +__global__ void GetSampledLabel(const int64_t n, const int64_t offset, const K* label, + K* sampled_label) { + CUDA_1D_KERNEL_LOOP(i, n) { sampled_label[i] = label[i] + offset; } +} + +template +__global__ void GetLabelMap(const int64_t n, const int64_t parallel_num, + const int64_t num_sample_per_rank, const K* bound_index, + const 
K* bound_value, K* label_map) { + CUDA_1D_KERNEL_LOOP(i, n) { +#pragma unroll + for (int64_t j = 0; j < parallel_num; j++) { + if (i >= bound_index[j] && i < bound_index[j + 1]) { + label_map[i] = label_map[i] - bound_value[j] + j * num_sample_per_rank; + } + } + } +} + +template +__global__ void GetPartionBound(const int64_t n, const int64_t parallel_num, + const int64_t num_classes_per_rank, const K* key_ptr, + const K* value_ptr, K* bound_index, K* bound_value) { + CUDA_1D_KERNEL_LOOP(i, n) { + if (i != 0) { + const K cur_in = key_ptr[i] / num_classes_per_rank; + const K pre_in = key_ptr[i - 1] / num_classes_per_rank; + if (cur_in > pre_in) { + assert(cur_in < parallel_num); +#pragma unroll + for (int32_t j = pre_in + 1; j <= cur_in; ++j) { + bound_index[j] = static_cast(i); + bound_value[j] = value_ptr[i]; + } + } + } + } + CUDA_1D_KERNEL_LOOP(i, parallel_num + 1) { + const K first_in = key_ptr[0] / num_classes_per_rank; + const K last_in = key_ptr[n - 1] / num_classes_per_rank; + if (i <= first_in) { + bound_index[i] = 0; + bound_value[i] = value_ptr[0]; + } else if (i > last_in) { + bound_index[i] = n; + bound_value[i] = value_ptr[n - 1]; + } + } +} + +template +__global__ void GetMappedLabel(const int64_t n, const K* label_map_key, const K* label_map_value, + K* mapped_label) { + CUDA_1D_KERNEL_LOOP(i, n) { mapped_label[label_map_key[i]] = label_map_value[i]; } +} + +template +void MapLabel(ep::Stream* stream, const int64_t num_classes, const int64_t batch_size, + const int64_t lower_bound, const int64_t parallel_num, const int64_t num_sample, + size_t temp_storage_bytes, const K* label_ptr, K* mapped_label_ptr, + K* cub_sort_values_ptr, K* cub_sort_keys_out_ptr, K* cub_sort_values_out_ptr, + void* cub_tmp_storage_ptr, K* bound_index_ptr, K* bound_value_ptr) { + IotaKernel<<As()->cuda_stream()>>>(batch_size, cub_sort_values_ptr); + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( + cub_tmp_storage_ptr, temp_storage_bytes, label_ptr, cub_sort_keys_out_ptr, + cub_sort_values_ptr, cub_sort_values_out_ptr, batch_size, 0, sizeof(K) * 8, + stream->As()->cuda_stream()))); + NotEqualToPreviousAdjacentIterator unique_counting_iter(cub_sort_keys_out_ptr, 0); + OF_CUDA_CHECK((hipcub::DeviceScan::InclusiveSum, K*>( + cub_tmp_storage_ptr, temp_storage_bytes, unique_counting_iter, cub_sort_values_ptr, + batch_size, stream->As()->cuda_stream()))); + + GetPartionBound<<As()->cuda_stream()>>>( + batch_size, parallel_num, num_classes, cub_sort_keys_out_ptr, cub_sort_values_ptr, + bound_index_ptr, bound_value_ptr); + + GetLabelMap<<As()->cuda_stream()>>>( + batch_size, parallel_num, num_sample, bound_index_ptr, bound_value_ptr, cub_sort_values_ptr); + + GetMappedLabel<<As()->cuda_stream()>>>( + batch_size, cub_sort_values_out_ptr, cub_sort_values_ptr, mapped_label_ptr); +} + +} // namespace + +template +class DistributedPartialFcSampleGpuKernel final : public user_op::OpKernel { + public: + DistributedPartialFcSampleGpuKernel() = default; + ~DistributedPartialFcSampleGpuKernel() override = default; + + std::shared_ptr CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + const SbpParallel& in_sbp = ctx->SbpParallel4ArgNameAndIndex("weight", 0); + const TensorDesc* in_logical_desc = ctx->LogicalTensorDesc4ArgNameAndIndex("weight", 0); + const int64_t class_num = in_logical_desc->shape().At(0); + const int64_t num_sample = ctx->Attr("num_sample"); + int64_t seed = ctx->Attr("seed"); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + const int64_t 
num_sample_per_rank = RoundUp(num_sample, parallel_num) / parallel_num; + if (in_sbp.has_split_parallel() && in_sbp.split_parallel().axis() == 0 && parallel_num > 1) { + std::seed_seq seq{seed}; + std::vector seeds(parallel_num); + seq.generate(seeds.begin(), seeds.end()); + seed = seeds.at(ctx->parallel_ctx().parallel_id()); + CHECK(ctx->SbpParallel4ArgNameAndIndex("label", 0).has_broadcast_parallel()); + BalancedSplitter bs(class_num, parallel_num); + return std::make_shared( + ctx->stream(), bs.At(ctx->parallel_ctx().parallel_id()).begin(), + bs.At(ctx->parallel_ctx().parallel_id()).end(), num_sample_per_rank, seed); + } else { + return std::make_shared(ctx->stream(), 0, class_num, + num_sample_per_rank, seed); + } + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* weight = ctx->Tensor4ArgNameAndIndex("weight", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + user_op::Tensor* mapped_label = ctx->Tensor4ArgNameAndIndex("mapped_label", 0); + user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); + user_op::Tensor* sampled_weight = ctx->Tensor4ArgNameAndIndex("sampled_weight", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + const int64_t batch_size = label->shape_view().At(0); + const int64_t num_classes = weight->shape_view().At(0); + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + TmpBufferManager buffer_manager(tmp_buffer->mut_dptr(), num_classes, batch_size, + parallel_num); + + auto* kernel_state = dynamic_cast(state); + CHECK_NOTNULL(kernel_state); + CHECK_EQ(num_classes, kernel_state->upper() - kernel_state->lower()); + const int64_t lower_bound = kernel_state->lower(); + const int64_t num_sample = kernel_state->num_sample_per_rank(); + kernel_state->GenRandom(ctx->stream(), num_classes, num_classes, + buffer_manager.CubSortKeysPtr()); + MarkPositive<<stream()->As()->cuda_stream()>>>( + batch_size, lower_bound, num_classes, label->dptr(), buffer_manager.CubSortKeysPtr()); + IotaKernel<<stream()->As()->cuda_stream()>>>( + num_classes, buffer_manager.CubSortValuesPtr()); + size_t temp_storage_bytes = buffer_manager.GetCubTmpStorageSize(); + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs( + buffer_manager.CubTmpStoragePtr(), temp_storage_bytes, buffer_manager.CubSortKeysPtr(), + buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesPtr(), + buffer_manager.CubSortValuesOutPtr(), num_classes, 0, sizeof(K) * 8, + ctx->stream()->As()->cuda_stream()))); + + GetSampledLabel<<stream()->As()->cuda_stream()>>>( + num_sample, lower_bound, buffer_manager.CubSortValuesOutPtr(), + sampled_label->mut_dptr()); + + GatherKernelUtilImpl::Forward( + ctx->stream(), buffer_manager.CubSortValuesOutPtr(), num_sample, weight->dptr(), + Shape({1, num_classes, weight->shape_view().Count(1)}), sampled_weight->mut_dptr(), 0); + + MapLabel(ctx->stream(), num_classes, batch_size, lower_bound, parallel_num, num_sample, + buffer_manager.GetCubTmpStorageSize(), label->dptr(), + mapped_label->mut_dptr(), buffer_manager.CubSortValuesPtr(), + buffer_manager.CubSortKeysOutPtr(), buffer_manager.CubSortValuesOutPtr(), + buffer_manager.CubTmpStoragePtr(), buffer_manager.BoundIndexPtr(), + buffer_manager.BoundValuePtr()); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define 
REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL(dtype_pair, ltype_pair) \ + REGISTER_USER_KERNEL("distributed_partial_fc_sample") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ + && (user_op::HobDataType("weight", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \ + .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \ + const int64_t num_classes = ctx->InputTensorDesc("weight", 0).shape().At(0); \ + const int64_t batch_size = ctx->InputTensorDesc("label", 0).shape().At(0); \ + const int64_t parallel_num = ctx->parallel_ctx().parallel_num(); \ + TmpBufferManager buffer_manager(nullptr, num_classes, \ + batch_size, parallel_num); \ + return buffer_manager.GetTotalBufferSize(); \ + }); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_CUDA_KERNEL, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +template +class DistributedPartialFcSampleDisableBoxingGpuKernel final : public user_op::OpKernel { + public: + DistributedPartialFcSampleDisableBoxingGpuKernel() = default; + ~DistributedPartialFcSampleDisableBoxingGpuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache*) const override { + const user_op::Tensor* sampled_weight_diff = + ctx->Tensor4ArgNameAndIndex("sampled_weight_diff", 0); + const user_op::Tensor* sampled_label = ctx->Tensor4ArgNameAndIndex("sampled_label", 0); + user_op::Tensor* boxing_disabled_sampled_weight_diff = + ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_weight_diff", 0); + user_op::Tensor* boxing_disabled_sampled_label = + ctx->Tensor4ArgNameAndIndex("boxing_disabled_sampled_label", 0); + Memcpy(ctx->stream(), boxing_disabled_sampled_weight_diff->mut_dptr(), + sampled_weight_diff->dptr(), + sampled_weight_diff->shape_view().elem_cnt() + * GetSizeOfDataType(sampled_weight_diff->data_type())); + Memcpy( + ctx->stream(), boxing_disabled_sampled_label->mut_dptr(), sampled_label->dptr(), + sampled_label->shape_view().elem_cnt() * GetSizeOfDataType(sampled_label->data_type())); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL(dtype_pair, ltype_pair) \ + REGISTER_USER_KERNEL("distributed_partial_fc_sample_disable_boxing") \ + .SetCreateFn>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("sampled_label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ + && (user_op::HobDataType("sampled_weight_diff", 0) == OF_PP_PAIR_SECOND(dtype_pair))); +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_DISTRIBUTED_PARTIAL_FC_SAMPLE_DISABLE_BOXING_CUDA_KERNEL, + FLOATING_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +} // namespace user_op +} // namespace oneflow #endif \ No newline at end of file diff --git a/oneflow/user/kernels/prelu_kernel.hip.cpp b/oneflow/user/kernels/prelu_kernel.hip.cpp index 1613ff4..870272b 100644 --- a/oneflow/user/kernels/prelu_kernel.hip.cpp +++ b/oneflow/user/kernels/prelu_kernel.hip.cpp @@ -1,505 +1,505 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/ndarray/ndarray_util.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -Shape CreatePreluLeftExtendedShape(const ShapeView& shape) { - DimVector dim_vec(shape.NumAxes()); - dim_vec.at(0) = 1LL; - dim_vec.at(1) = shape.At(1); - for (int i = 2; i < shape.NumAxes(); i++) { dim_vec.at(i) = 1LL; } - return Shape(std::move(dim_vec)); -} - -template -struct PreluForwardSingleAlphaFunctor { - OF_DEVICE_FUNC explicit PreluForwardSingleAlphaFunctor(const T alpha) : alpha(alpha) {} - __device__ T operator()(T x) const { return (x > static_cast(0.0)) ? x : (alpha * x); } - const T alpha; -}; - -template -struct PreluForwardSingleAlphaPtrFunctor { - OF_DEVICE_FUNC explicit PreluForwardSingleAlphaPtrFunctor(const T* alpha_ptr) - : alpha_ptr(alpha_ptr) {} - __device__ PreluForwardSingleAlphaFunctor operator()() const { - return PreluForwardSingleAlphaFunctor(*alpha_ptr); - } - const T* alpha_ptr; -}; - -template -__global__ void PReluBackwardSingleAlphaGpu(const IndexType elem_cnt, const int64_t n_tail, - const T* x, const T* alpha, const T* dy, T* dx, - T* alpha_diff, const T* tail_x, const T* tail_dy, - T* tail_dx, T* tail_alpha_diff) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - T zero_val = static_cast(0); - T alpha_val = alpha[0]; - - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - const LoadType* dy_load = reinterpret_cast(dy + linear_index); - LoadPack dy_vec; - dy_vec.storage = *dy_load; - - LoadPack dx_vec; - T zero_val = static_cast(0.0); - if (alpha_requires_grad) { - LoadPack dalpha_vec; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - dalpha_vec.elem[i] = zero_val; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; - } - } - *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; - } else { -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - } - } - *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - } - } - - if (tail && global_thread_id < n_tail) { - const T tail_dy_val = tail_dy[global_thread_id]; - if (tail_x[global_thread_id] > zero_val) { - tail_dx[global_thread_id] = tail_dy_val; - if (alpha_requires_grad) { tail_alpha_diff[global_thread_id] = zero_val; } - } else { - tail_dx[global_thread_id] = alpha_val * tail_dy_val; - if (alpha_requires_grad) { - tail_alpha_diff[global_thread_id] = tail_x[global_thread_id] * tail_dy_val; - } - } - } -} - 
-template -__global__ void BroadcastPReluMultiAlphaNaiveForwardGpu(const int32_t elem_cnt, - const int32_t alpha_size, - const int32_t inner_size, const T* x, - const T* alpha, T* y) { - const T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - int32_t alpha_idx = (i / inner_size) % alpha_size; - y[i] = x_i > zero_val ? x_i : x_i * alpha[alpha_idx]; - } -} - -template -__global__ void PReluForwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, - const IndexType inner_size, const T* x, const T* alpha, - T* y) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - T zero_val = static_cast(0); - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - IndexType alpha_idx = (linear_index / inner_size) % alpha_size; - - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - LoadPack y_vec; - - T alpha_val = alpha[alpha_idx]; -#pragma unroll - for (int i = 0; i < pack_size; i++) { - y_vec.elem[i] = x_vec.elem[i] > zero_val ? x_vec.elem[i] : x_vec.elem[i] * alpha_val; - } - *(reinterpret_cast(y + linear_index)) = y_vec.storage; - } -} - -template -__global__ void BroadcastPReluMultiAlphaNaiveBackwardGpu(const int32_t elem_cnt, - const int32_t alpha_size, - const int32_t inner_size, const T* x, - const T* alpha, const T* dy, T* dx, - T* alpha_diff) { - const T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T dy_i = dy[i]; - int32_t alpha_i = (i / inner_size) % alpha_size; - if (x_i > zero_val) { - dx[i] = dy_i; - if (alpha_requires_grad) { alpha_diff[i] = zero_val; } - } else { - dx[i] = dy_i * alpha[alpha_i]; - if (alpha_requires_grad) { alpha_diff[i] = dy_i * x_i; } - } - } -} - -template -__global__ void PReluBackwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, - const IndexType inner_size, const T* x, const T* alpha, - const T* dy, T* dx, T* alpha_diff) { - int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - - using LoadType = cuda::elementwise::PackType; - using LoadPack = cuda::elementwise::Pack; - T zero_val = static_cast(0); - for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; - linear_index += gridDim.x * blockDim.x * pack_size) { - IndexType alpha_idx = (linear_index / inner_size) % alpha_size; - - const LoadType* x_load = reinterpret_cast(x + linear_index); - LoadPack x_vec; - x_vec.storage = *x_load; - - const LoadType* dy_load = reinterpret_cast(dy + linear_index); - LoadPack dy_vec; - dy_vec.storage = *dy_load; - - LoadPack dx_vec; - T alpha_val = alpha[alpha_idx]; - if (alpha_requires_grad) { - LoadPack dalpha_vec; - T zero_val = static_cast(0.0); -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - dalpha_vec.elem[i] = zero_val; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; - } - } - *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; - } else { -#pragma unroll - for (int i = 0; i < pack_size; i++) { - if (x_vec.elem[i] > zero_val) { - dx_vec.elem[i] = dy_vec.elem[i]; - } else { - dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; - } - } - 
*(reinterpret_cast(dx + linear_index)) = dx_vec.storage; - } - } -} - -constexpr int32_t kBlockSize = 256; - -template -int GetLaunchPackSize(const int64_t inner_size) { - constexpr int type_pack_size = cuda::elementwise::PackSize(); - for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { - if (type_pack_size >= launch_pack_size && inner_size % launch_pack_size == 0) { - return launch_pack_size; - } - } - return 1; -} - -template -void DispatchPreluForwardPackSize(ep::Stream* stream, const int64_t elem_cnt, - const int64_t alpha_size, const int64_t inner_size, const T* x, - const T* alpha, T* y) { - int grid_size; - const int pack_size = GetLaunchPackSize(inner_size); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - if (pack_size == 8) { - PReluForwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } else if (pack_size == 4) { - PReluForwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } else if (pack_size == 2) { - PReluForwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } else { - BroadcastPReluMultiAlphaNaiveForwardGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, y); - } -} - -template -void DispatchPreluForwardIndex(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, - const int64_t inner_size, const T* x, const T* alpha, T* y) { - if (elem_cnt < GetMaxVal()) { - DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); - } else { - DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); - } -} - -template -void DispatchPreluBackwardPackSize(ep::Stream* stream, const int64_t elem_cnt, - const int64_t alpha_size, const int64_t inner_size, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - int grid_size; - const int pack_size = GetLaunchPackSize(inner_size); - const int64_t pack_num = elem_cnt / pack_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - - if (pack_size == 8) { - if (alpha_requires_grad) { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - } else if (pack_size == 4) { - if (alpha_requires_grad) { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - } else if (pack_size == 2) { - if (alpha_requires_grad) { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - PReluBackwardMultiAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - - } else { - if (alpha_requires_grad) { - BroadcastPReluMultiAlphaNaiveBackwardGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } else { - BroadcastPReluMultiAlphaNaiveBackwardGpu - <<As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); - } - } -} - -template -void 
DispatchPreluBackwardIndex(ep::Stream* stream, const int64_t elem_cnt, - const int64_t alpha_size, const int64_t inner_size, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - if (elem_cnt < GetMaxVal()) { - DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, - dy, dx, alpha_diff, alpha_requires_grad); - } else { - DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, - dy, dx, alpha_diff, alpha_requires_grad); - } -} - -template -void DispatchPreluBackwardSingleAlphaTail(ep::Stream* stream, const IndexType elem_cnt, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - constexpr int pack_size = cuda::elementwise::PackSize(); - const int64_t pack_num = elem_cnt / pack_size; - int grid_size; - hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); - const int64_t tail_offset = pack_num * pack_size; - const int64_t n_tail = elem_cnt - tail_offset; - const bool tail = n_tail > 0 ? true : false; - if (tail) { - if (alpha_requires_grad) { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } else { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } - } else { - if (alpha_requires_grad) { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } else { - PReluBackwardSingleAlphaGpu - <<As()->cuda_stream()>>>( - elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, - dx + tail_offset, alpha_diff + tail_offset); - } - } -} - -template -void DispatchPreluBackwardSingleAlphaIndex(ep::Stream* stream, const int64_t elem_cnt, const T* x, - const T* alpha, const T* dy, T* dx, T* alpha_diff, - const bool alpha_requires_grad) { - if (elem_cnt < GetMaxVal()) { - DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, - alpha_requires_grad); - } else { - DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, - alpha_requires_grad); - } -} - -} // namespace - -template -class GpuPReluKernel final : public user_op::OpKernel { - public: - GpuPReluKernel() = default; - ~GpuPReluKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - const int32_t batch = x->shape_view().At(0); - const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / batch / channels; - - if (alpha_size == 1) { - OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( - PreluForwardSingleAlphaPtrFunctor(reinterpret_cast(alpha->dptr())), elem_cnt, - reinterpret_cast(y->mut_dptr()), reinterpret_cast(x->dptr()), - ctx->stream()->As()->cuda_stream()))); - } else { - DispatchPreluForwardIndex( - ctx->stream(), elem_cnt, alpha_size, inner_size, reinterpret_cast(x->dptr()), - reinterpret_cast(alpha->dptr()), reinterpret_cast(y->mut_dptr())); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_PRELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("prelu").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); - -REGISTER_CUDA_PRELU_KERNEL(half) -REGISTER_CUDA_PRELU_KERNEL(float) -REGISTER_CUDA_PRELU_KERNEL(double) - -template -class GpuPReluGradKernel final : public user_op::OpKernel { - public: - GpuPReluGradKernel() = default; - ~GpuPReluGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const bool alpha_requires_grad = ctx->Attr("alpha_requires_grad"); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); - T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(elem_cnt * sizeof(T))); - - const Shape& left_extended_shape = CreatePreluLeftExtendedShape(ShapeView(x->shape_view())); - - const int32_t batch = x->shape_view().At(0); - const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / batch / channels; - if (alpha_size == 1) { - DispatchPreluBackwardSingleAlphaIndex(ctx->stream(), elem_cnt, x->dptr(), - alpha->dptr(), dy->dptr(), dx->mut_dptr(), - broadcasted_alpha_diff, alpha_requires_grad); - } else { - DispatchPreluBackwardIndex(ctx->stream(), elem_cnt, alpha_size, inner_size, x->dptr(), - alpha->dptr(), dy->dptr(), dx->mut_dptr(), - broadcasted_alpha_diff, alpha_requires_grad); - } - if (alpha_requires_grad) { - NdarrayUtil::ReduceSum( - ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), - XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), - XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_PRELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("prelu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("x", 0); \ - const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ - const int64_t tmp_buffer_size = \ - 2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -REGISTER_CUDA_PRELU_GRAD_KERNEL(half) -REGISTER_CUDA_PRELU_GRAD_KERNEL(float) -REGISTER_CUDA_PRELU_GRAD_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +Shape CreatePreluLeftExtendedShape(const ShapeView& shape) { + DimVector dim_vec(shape.NumAxes()); + dim_vec.at(0) = 1LL; + dim_vec.at(1) = shape.At(1); + for (int i = 2; i < shape.NumAxes(); i++) { dim_vec.at(i) = 1LL; } + return Shape(std::move(dim_vec)); +} + +template +struct PreluForwardSingleAlphaFunctor { + OF_DEVICE_FUNC explicit PreluForwardSingleAlphaFunctor(const T alpha) : alpha(alpha) {} + __device__ T operator()(T x) const { return (x > static_cast(0.0)) ? 
x : (alpha * x); } + const T alpha; +}; + +template +struct PreluForwardSingleAlphaPtrFunctor { + OF_DEVICE_FUNC explicit PreluForwardSingleAlphaPtrFunctor(const T* alpha_ptr) + : alpha_ptr(alpha_ptr) {} + __device__ PreluForwardSingleAlphaFunctor operator()() const { + return PreluForwardSingleAlphaFunctor(*alpha_ptr); + } + const T* alpha_ptr; +}; + +template +__global__ void PReluBackwardSingleAlphaGpu(const IndexType elem_cnt, const int64_t n_tail, + const T* x, const T* alpha, const T* dy, T* dx, + T* alpha_diff, const T* tail_x, const T* tail_dy, + T* tail_dx, T* tail_alpha_diff) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + T zero_val = static_cast(0); + T alpha_val = alpha[0]; + + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + const LoadType* dy_load = reinterpret_cast(dy + linear_index); + LoadPack dy_vec; + dy_vec.storage = *dy_load; + + LoadPack dx_vec; + T zero_val = static_cast(0.0); + if (alpha_requires_grad) { + LoadPack dalpha_vec; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + dalpha_vec.elem[i] = zero_val; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; + } else { +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + } + } + + if (tail && global_thread_id < n_tail) { + const T tail_dy_val = tail_dy[global_thread_id]; + if (tail_x[global_thread_id] > zero_val) { + tail_dx[global_thread_id] = tail_dy_val; + if (alpha_requires_grad) { tail_alpha_diff[global_thread_id] = zero_val; } + } else { + tail_dx[global_thread_id] = alpha_val * tail_dy_val; + if (alpha_requires_grad) { + tail_alpha_diff[global_thread_id] = tail_x[global_thread_id] * tail_dy_val; + } + } + } +} + +template +__global__ void BroadcastPReluMultiAlphaNaiveForwardGpu(const int32_t elem_cnt, + const int32_t alpha_size, + const int32_t inner_size, const T* x, + const T* alpha, T* y) { + const T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + int32_t alpha_idx = (i / inner_size) % alpha_size; + y[i] = x_i > zero_val ? 
x_i : x_i * alpha[alpha_idx]; + } +} + +template +__global__ void PReluForwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, + const IndexType inner_size, const T* x, const T* alpha, + T* y) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + T zero_val = static_cast(0); + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + IndexType alpha_idx = (linear_index / inner_size) % alpha_size; + + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + LoadPack y_vec; + + T alpha_val = alpha[alpha_idx]; +#pragma unroll + for (int i = 0; i < pack_size; i++) { + y_vec.elem[i] = x_vec.elem[i] > zero_val ? x_vec.elem[i] : x_vec.elem[i] * alpha_val; + } + *(reinterpret_cast(y + linear_index)) = y_vec.storage; + } +} + +template +__global__ void BroadcastPReluMultiAlphaNaiveBackwardGpu(const int32_t elem_cnt, + const int32_t alpha_size, + const int32_t inner_size, const T* x, + const T* alpha, const T* dy, T* dx, + T* alpha_diff) { + const T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T dy_i = dy[i]; + int32_t alpha_i = (i / inner_size) % alpha_size; + if (x_i > zero_val) { + dx[i] = dy_i; + if (alpha_requires_grad) { alpha_diff[i] = zero_val; } + } else { + dx[i] = dy_i * alpha[alpha_i]; + if (alpha_requires_grad) { alpha_diff[i] = dy_i * x_i; } + } + } +} + +template +__global__ void PReluBackwardMultiAlphaGpu(const IndexType elem_cnt, const IndexType alpha_size, + const IndexType inner_size, const T* x, const T* alpha, + const T* dy, T* dx, T* alpha_diff) { + int32_t global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + using LoadType = cuda::elementwise::PackType; + using LoadPack = cuda::elementwise::Pack; + T zero_val = static_cast(0); + for (int64_t linear_index = global_thread_id * pack_size; linear_index < elem_cnt; + linear_index += gridDim.x * blockDim.x * pack_size) { + IndexType alpha_idx = (linear_index / inner_size) % alpha_size; + + const LoadType* x_load = reinterpret_cast(x + linear_index); + LoadPack x_vec; + x_vec.storage = *x_load; + + const LoadType* dy_load = reinterpret_cast(dy + linear_index); + LoadPack dy_vec; + dy_vec.storage = *dy_load; + + LoadPack dx_vec; + T alpha_val = alpha[alpha_idx]; + if (alpha_requires_grad) { + LoadPack dalpha_vec; + T zero_val = static_cast(0.0); +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + dalpha_vec.elem[i] = zero_val; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + dalpha_vec.elem[i] = dy_vec.elem[i] * x_vec.elem[i]; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + *(reinterpret_cast(alpha_diff + linear_index)) = dalpha_vec.storage; + } else { +#pragma unroll + for (int i = 0; i < pack_size; i++) { + if (x_vec.elem[i] > zero_val) { + dx_vec.elem[i] = dy_vec.elem[i]; + } else { + dx_vec.elem[i] = dy_vec.elem[i] * alpha_val; + } + } + *(reinterpret_cast(dx + linear_index)) = dx_vec.storage; + } + } +} + +constexpr int32_t kBlockSize = 256; + +template +int GetLaunchPackSize(const int64_t inner_size) { + constexpr int type_pack_size = cuda::elementwise::PackSize(); + for (int launch_pack_size = 8; launch_pack_size > 0; launch_pack_size /= 2) { + if (type_pack_size >= launch_pack_size && inner_size % 
launch_pack_size == 0) { + return launch_pack_size; + } + } + return 1; +} + +template +void DispatchPreluForwardPackSize(ep::Stream* stream, const int64_t elem_cnt, + const int64_t alpha_size, const int64_t inner_size, const T* x, + const T* alpha, T* y) { + int grid_size; + const int pack_size = GetLaunchPackSize(inner_size); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + if (pack_size == 8) { + PReluForwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } else if (pack_size == 4) { + PReluForwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } else if (pack_size == 2) { + PReluForwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } else { + BroadcastPReluMultiAlphaNaiveForwardGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, y); + } +} + +template +void DispatchPreluForwardIndex(ep::Stream* stream, const int64_t elem_cnt, const int64_t alpha_size, + const int64_t inner_size, const T* x, const T* alpha, T* y) { + if (elem_cnt < GetMaxVal()) { + DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); + } else { + DispatchPreluForwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, y); + } +} + +template +void DispatchPreluBackwardPackSize(ep::Stream* stream, const int64_t elem_cnt, + const int64_t alpha_size, const int64_t inner_size, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + int grid_size; + const int pack_size = GetLaunchPackSize(inner_size); + const int64_t pack_num = elem_cnt / pack_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + + if (pack_size == 8) { + if (alpha_requires_grad) { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + } else if (pack_size == 4) { + if (alpha_requires_grad) { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + } else if (pack_size == 2) { + if (alpha_requires_grad) { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + PReluBackwardMultiAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + + } else { + if (alpha_requires_grad) { + BroadcastPReluMultiAlphaNaiveBackwardGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } else { + BroadcastPReluMultiAlphaNaiveBackwardGpu + <<As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x, alpha, dy, dx, alpha_diff); + } + } +} + +template +void DispatchPreluBackwardIndex(ep::Stream* stream, const int64_t elem_cnt, + const int64_t alpha_size, const int64_t inner_size, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + if (elem_cnt < GetMaxVal()) { + DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, + dy, dx, alpha_diff, alpha_requires_grad); + } 
else { + DispatchPreluBackwardPackSize(stream, elem_cnt, alpha_size, inner_size, x, alpha, + dy, dx, alpha_diff, alpha_requires_grad); + } +} + +template +void DispatchPreluBackwardSingleAlphaTail(ep::Stream* stream, const IndexType elem_cnt, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + constexpr int pack_size = cuda::elementwise::PackSize(); + const int64_t pack_num = elem_cnt / pack_size; + int grid_size; + hipError_t err = cuda::elementwise::GetNumBlocks(pack_num, &grid_size); + const int64_t tail_offset = pack_num * pack_size; + const int64_t n_tail = elem_cnt - tail_offset; + const bool tail = n_tail > 0 ? true : false; + if (tail) { + if (alpha_requires_grad) { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } else { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } + } else { + if (alpha_requires_grad) { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } else { + PReluBackwardSingleAlphaGpu + <<As()->cuda_stream()>>>( + elem_cnt, n_tail, x, alpha, dy, dx, alpha_diff, x + tail_offset, dy + tail_offset, + dx + tail_offset, alpha_diff + tail_offset); + } + } +} + +template +void DispatchPreluBackwardSingleAlphaIndex(ep::Stream* stream, const int64_t elem_cnt, const T* x, + const T* alpha, const T* dy, T* dx, T* alpha_diff, + const bool alpha_requires_grad) { + if (elem_cnt < GetMaxVal()) { + DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, + alpha_requires_grad); + } else { + DispatchPreluBackwardSingleAlphaTail(stream, elem_cnt, x, alpha, dy, dx, alpha_diff, + alpha_requires_grad); + } +} + +} // namespace + +template +class GpuPReluKernel final : public user_op::OpKernel { + public: + GpuPReluKernel() = default; + ~GpuPReluKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + const int32_t batch = x->shape_view().At(0); + const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / batch / channels; + + if (alpha_size == 1) { + OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory( + PreluForwardSingleAlphaPtrFunctor(reinterpret_cast(alpha->dptr())), elem_cnt, + reinterpret_cast(y->mut_dptr()), reinterpret_cast(x->dptr()), + ctx->stream()->As()->cuda_stream()))); + } else { + DispatchPreluForwardIndex( + ctx->stream(), elem_cnt, alpha_size, inner_size, reinterpret_cast(x->dptr()), + reinterpret_cast(alpha->dptr()), reinterpret_cast(y->mut_dptr())); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_PRELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("prelu").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); + +REGISTER_CUDA_PRELU_KERNEL(half) +REGISTER_CUDA_PRELU_KERNEL(float) +REGISTER_CUDA_PRELU_KERNEL(double) + +template +class GpuPReluGradKernel final : public user_op::OpKernel { + public: + GpuPReluGradKernel() = default; + ~GpuPReluGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const bool alpha_requires_grad = ctx->Attr("alpha_requires_grad"); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); + T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(elem_cnt * sizeof(T))); + + const Shape& left_extended_shape = CreatePreluLeftExtendedShape(ShapeView(x->shape_view())); + + const int32_t batch = x->shape_view().At(0); + const int32_t channels = (x->shape_view().NumAxes() == 1) ? 
1 : x->shape_view().At(1); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / batch / channels; + if (alpha_size == 1) { + DispatchPreluBackwardSingleAlphaIndex(ctx->stream(), elem_cnt, x->dptr(), + alpha->dptr(), dy->dptr(), dx->mut_dptr(), + broadcasted_alpha_diff, alpha_requires_grad); + } else { + DispatchPreluBackwardIndex(ctx->stream(), elem_cnt, alpha_size, inner_size, x->dptr(), + alpha->dptr(), dy->dptr(), dx->mut_dptr(), + broadcasted_alpha_diff, alpha_requires_grad); + } + if (alpha_requires_grad) { + NdarrayUtil::ReduceSum( + ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), + XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), + XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_PRELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("prelu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("x", 0); \ + const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ + const int64_t tmp_buffer_size = \ + 2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +REGISTER_CUDA_PRELU_GRAD_KERNEL(half) +REGISTER_CUDA_PRELU_GRAD_KERNEL(float) +REGISTER_CUDA_PRELU_GRAD_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/quantization_kernel.hip.cpp b/oneflow/user/kernels/quantization_kernel.hip.cpp index 51fbbde..2f31245 100644 --- a/oneflow/user/kernels/quantization_kernel.hip.cpp +++ b/oneflow/user/kernels/quantization_kernel.hip.cpp @@ -1,159 +1,159 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/kernel_util.hip.h" - -namespace oneflow { - -namespace { - -template -__global__ void QuantizationSymmetric(const T* in_ptr, const T* scale_ptr, const int64_t scale_size, - const int64_t elements, const int64_t panel_size, - const double quantization_bit, T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? 
lower_bound : out; - out_ptr[gid] = out; - - gid += step; - } -} - -template -__global__ void QuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, - const int64_t scale_size, const int64_t elements, - const int64_t panel_size, const double quantization_bit, - T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; - T lower_bound = 0; - - while (gid < elements) { - int64_t channel_index = gid / panel_size; - int64_t scale_idx = min(scale_size - 1, channel_index); - - T scale = scale_ptr[scale_idx]; - T zero_point = zero_point_ptr[scale_idx]; - - T out = nearbyint(in_ptr[gid] / scale + zero_point); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out; - - gid += step; - } -} - -template -__global__ void QuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, - const int64_t elements, const int64_t panel_size, - const double quantization_bit, T* out_ptr) { - int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int64_t step = gridDim.x * blockDim.x; - - T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; - T lower_bound = -upper_bound - 1; - - T scale = static_cast(pow(2.0, static_cast(shift[0]))); - - while (gid < elements) { - T out = nearbyint(in_ptr[gid] / scale); - out = out > upper_bound ? upper_bound : out; - out = out < lower_bound ? lower_bound : out; - out_ptr[gid] = out; - gid += step; - } -} - -} // namespace - -template -class GpuQuantizationKernel final : public user_op::OpKernel { - public: - GpuQuantizationKernel() = default; - ~GpuQuantizationKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); - const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const std::string quantization_scheme = ctx->Attr("quantization_scheme"); - const int32_t quantization_bit = ctx->Attr("quantization_bit"); - const std::string quantization_formula = ctx->Attr("quantization_formula"); - - const int64_t elements = in->shape_view().elem_cnt(); - const int64_t panel_size = in->shape_view().Count(1); - const int64_t scale_size = scale->shape_view().elem_cnt(); - - // round to even - auto origin_round_mode = std::fegetround(); - std::fesetround(FE_TONEAREST); - - if (quantization_formula == "google") { - if (quantization_scheme == "symmetric") { - RUN_CUDA_KERNEL((QuantizationSymmetric), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { // quantization_scheme == "affine" - RUN_CUDA_KERNEL((QuantizationAffine), ctx->stream(), elements, in->dptr(), - scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, - quantization_bit, out->mut_dptr()); - } - } else if (quantization_formula == "cambricon") { - RUN_CUDA_KERNEL((QuantizationCambricon), ctx->stream(), elements, in->dptr(), - scale->dptr(), scale_size, elements, panel_size, quantization_bit, - out->mut_dptr()); - } else { - UNIMPLEMENTED(); - } - - std::fesetround(origin_round_mode); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define 
REGISTER_QUANTIZATION_KERNEL(dtype) \ - REGISTER_USER_KERNEL("quantization") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_QUANTIZATION_KERNEL(float); -REGISTER_QUANTIZATION_KERNEL(double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/kernel_util.hip.h" + +namespace oneflow { + +namespace { + +template +__global__ void QuantizationSymmetric(const T* in_ptr, const T* scale_ptr, const int64_t scale_size, + const int64_t elements, const int64_t panel_size, + const double quantization_bit, T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = out; + + gid += step; + } +} + +template +__global__ void QuantizationAffine(const T* in_ptr, const T* scale_ptr, const T* zero_point_ptr, + const int64_t scale_size, const int64_t elements, + const int64_t panel_size, const double quantization_bit, + T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit)) - 1; + T lower_bound = 0; + + while (gid < elements) { + int64_t channel_index = gid / panel_size; + int64_t scale_idx = min(scale_size - 1, channel_index); + + T scale = scale_ptr[scale_idx]; + T zero_point = zero_point_ptr[scale_idx]; + + T out = nearbyint(in_ptr[gid] / scale + zero_point); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? lower_bound : out; + out_ptr[gid] = out; + + gid += step; + } +} + +template +__global__ void QuantizationCambricon(const T* in_ptr, const T* shift, const int64_t scale_size, + const int64_t elements, const int64_t panel_size, + const double quantization_bit, T* out_ptr) { + int64_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int64_t step = gridDim.x * blockDim.x; + + T upper_bound = static_cast(pow(2.0, quantization_bit - 1)) - 1; + T lower_bound = -upper_bound - 1; + + T scale = static_cast(pow(2.0, static_cast(shift[0]))); + + while (gid < elements) { + T out = nearbyint(in_ptr[gid] / scale); + out = out > upper_bound ? upper_bound : out; + out = out < lower_bound ? 
lower_bound : out; + out_ptr[gid] = out; + gid += step; + } +} + +} // namespace + +template +class GpuQuantizationKernel final : public user_op::OpKernel { + public: + GpuQuantizationKernel() = default; + ~GpuQuantizationKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* scale = ctx->Tensor4ArgNameAndIndex("scale", 0); + const user_op::Tensor* zero_point = ctx->Tensor4ArgNameAndIndex("zero_point", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const std::string quantization_scheme = ctx->Attr("quantization_scheme"); + const int32_t quantization_bit = ctx->Attr("quantization_bit"); + const std::string quantization_formula = ctx->Attr("quantization_formula"); + + const int64_t elements = in->shape_view().elem_cnt(); + const int64_t panel_size = in->shape_view().Count(1); + const int64_t scale_size = scale->shape_view().elem_cnt(); + + // round to even + auto origin_round_mode = std::fegetround(); + std::fesetround(FE_TONEAREST); + + if (quantization_formula == "google") { + if (quantization_scheme == "symmetric") { + RUN_CUDA_KERNEL((QuantizationSymmetric), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { // quantization_scheme == "affine" + RUN_CUDA_KERNEL((QuantizationAffine), ctx->stream(), elements, in->dptr(), + scale->dptr(), zero_point->dptr(), scale_size, elements, panel_size, + quantization_bit, out->mut_dptr()); + } + } else if (quantization_formula == "cambricon") { + RUN_CUDA_KERNEL((QuantizationCambricon), ctx->stream(), elements, in->dptr(), + scale->dptr(), scale_size, elements, panel_size, quantization_bit, + out->mut_dptr()); + } else { + UNIMPLEMENTED(); + } + + std::fesetround(origin_round_mode); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_QUANTIZATION_KERNEL(dtype) \ + REGISTER_USER_KERNEL("quantization") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_QUANTIZATION_KERNEL(float); +REGISTER_QUANTIZATION_KERNEL(double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/radix_sort.hip.h b/oneflow/user/kernels/radix_sort.hip.h index 8cdd903..c47f090 100644 --- a/oneflow/user/kernels/radix_sort.hip.h +++ b/oneflow/user/kernels/radix_sort.hip.h @@ -1,280 +1,280 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
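// The quantization kernels in this file scale, round (to nearest even) and
// clamp each element: the symmetric scheme uses the range
// [-2^(bit-1), 2^(bit-1) - 1], the affine scheme uses [0, 2^bit - 1] with a
// zero point added before rounding. A minimal host-side sketch of that math
// (the helper name and parameters are illustrative only):
#include <algorithm>
#include <cfenv>
#include <cmath>

inline float QuantizeOneValue(float in, float scale, float zero_point, int bit, bool symmetric) {
  const float upper = symmetric ? std::pow(2.0f, bit - 1) - 1.0f : std::pow(2.0f, bit) - 1.0f;
  const float lower = symmetric ? -upper - 1.0f : 0.0f;
  std::fesetround(FE_TONEAREST);  // round half to even, as the kernel wrapper sets before launch
  const float out = std::nearbyint(in / scale + (symmetric ? 0.0f : zero_point));
  return std::min(std::max(out, lower), upper);
}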
-*/ -#ifndef ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ -#define ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ - -#include -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { - -namespace { - -class MultiplyFunctor final { - public: - MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} - __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { - return idx * num_col_; - } - - private: - int32_t num_col_; -}; - -} // namespace - -template -size_t InferTempStorageForSortPairsAscending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* d_values_in */ nullptr, - /* d_values_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -size_t InferTempStorageForSortPairsDescending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = - hipcub::DeviceSegmentedRadixSort::SortPairsDescending( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* d_values_in */ nullptr, - /* d_values_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -size_t InferTempStorageForSortKeysAscending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = hipcub::DeviceSegmentedRadixSort::SortKeys( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -size_t InferTempStorageForSortKeysDescending(int32_t num_row, int32_t num_col) { - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - size_t temp_storage_bytes = 0; - auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeysDescending( - /* d_temp_storage */ nullptr, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ nullptr, - /* d_keys_out */ nullptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ 0); - OF_CUDA_CHECK(err); - - return temp_storage_bytes; -} - -template -void SortPairsAscending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, - int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, - KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortPairsAscending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* d_values_in */ values_ptr, - /* d_values_out */ sorted_values_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -template -void SortPairsDescending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, - int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, - KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortPairsDescending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = hipcub::DeviceSegmentedRadixSort::SortPairsDescending( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* d_values_in */ values_ptr, - /* d_values_out */ sorted_values_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -template -void SortKeysAscending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, - void* temp_storage_ptr, int32_t temp_storage_bytes, KeyType* sorted_keys_ptr, - hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortKeysAscending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeys( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -template -void SortKeysDescending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, - void* temp_storage_ptr, int32_t temp_storage_bytes, - KeyType* sorted_keys_ptr, hipStream_t stream) { - size_t rt_inferred_temp_storage_bytes = - InferTempStorageForSortKeysDescending(num_row, num_col); - CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); - - using SegmentOffsetIter = - hipcub::TransformInputIterator>; - - hipcub::CountingInputIterator counting_iter(0); - MultiplyFunctor multiply_functor(num_col); - SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); - - auto err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending( - /* d_temp_storage */ temp_storage_ptr, - /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, - /* d_keys_in */ keys_ptr, - /* d_keys_out */ sorted_keys_ptr, - /* num_items */ num_row * num_col, - /* num_segments */ num_row, - /* d_begin_offsets */ segment_offset_iter, - /* d_end_offsets */ segment_offset_iter + 1, - /* begin_bit */ 0, - /* end_bit */ sizeof(KeyType) * 8, - /* stream */ stream); - OF_CUDA_CHECK(err); -} - -} // namespace oneflow - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
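// The radix_sort.hip.h helpers in this patch wrap hipcub::DeviceSegmentedRadixSort
// with the usual two-phase pattern: a first call with a null temp-storage pointer
// only reports temp_storage_bytes (this is what the InferTempStorage* functions do),
// and a second call with a real buffer performs the sort. A minimal sketch of that
// pattern, assuming a precomputed offsets array of num_segments + 1 ints (the
// function name and offset layout are illustrative only):
#include "hip/hip_runtime.h"
#include <hipcub/hipcub.hpp>

template<typename KeyType>
hipError_t SegmentedSortKeysTwoPhase(const KeyType* keys_in, KeyType* keys_out, int num_items,
                                     int num_segments, const int* d_offsets, hipStream_t stream) {
  // Phase 1: size query only; no sorting happens when d_temp_storage is nullptr.
  size_t temp_bytes = 0;
  hipError_t err = hipcub::DeviceSegmentedRadixSort::SortKeys(
      nullptr, temp_bytes, keys_in, keys_out, num_items, num_segments, d_offsets, d_offsets + 1,
      /*begin_bit=*/0, /*end_bit=*/sizeof(KeyType) * 8, stream);
  if (err != hipSuccess) { return err; }
  void* d_temp = nullptr;
  err = hipMalloc(&d_temp, temp_bytes);
  if (err != hipSuccess) { return err; }
  // Phase 2: the actual segmented sort, one segment per row.
  err = hipcub::DeviceSegmentedRadixSort::SortKeys(
      d_temp, temp_bytes, keys_in, keys_out, num_items, num_segments, d_offsets, d_offsets + 1,
      /*begin_bit=*/0, /*end_bit=*/sizeof(KeyType) * 8, stream);
  hipFree(d_temp);
  return err;
}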
+*/ +#ifndef ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ +#define ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ + +#include +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { + +namespace { + +class MultiplyFunctor final { + public: + MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} + __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { + return idx * num_col_; + } + + private: + int32_t num_col_; +}; + +} // namespace + +template +size_t InferTempStorageForSortPairsAscending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* d_values_in */ nullptr, + /* d_values_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +size_t InferTempStorageForSortPairsDescending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = + hipcub::DeviceSegmentedRadixSort::SortPairsDescending( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* d_values_in */ nullptr, + /* d_values_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +size_t InferTempStorageForSortKeysAscending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = hipcub::DeviceSegmentedRadixSort::SortKeys( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +size_t InferTempStorageForSortKeysDescending(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeysDescending( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +template +void SortPairsAscending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, + int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, + KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortPairsAscending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = hipcub::DeviceSegmentedRadixSort::SortPairs( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* d_values_in */ values_ptr, + /* d_values_out */ sorted_values_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +template +void SortPairsDescending(const KeyType* keys_ptr, const ValueType* values_ptr, int32_t num_row, + int32_t num_col, void* temp_storage_ptr, int32_t temp_storage_bytes, + KeyType* sorted_keys_ptr, ValueType* sorted_values_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortPairsDescending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = hipcub::DeviceSegmentedRadixSort::SortPairsDescending( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* d_values_in */ values_ptr, + /* d_values_out */ sorted_values_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +template +void SortKeysAscending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, + void* temp_storage_ptr, int32_t temp_storage_bytes, KeyType* sorted_keys_ptr, + hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortKeysAscending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = 
hipcub::DeviceSegmentedRadixSort::SortKeys( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +template +void SortKeysDescending(const KeyType* keys_ptr, int32_t num_row, int32_t num_col, + void* temp_storage_ptr, int32_t temp_storage_bytes, + KeyType* sorted_keys_ptr, hipStream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortKeysDescending(num_row, num_col); + CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes); + + using SegmentOffsetIter = + hipcub::TransformInputIterator>; + + hipcub::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); + OF_CUDA_CHECK(err); +} + +} // namespace oneflow + #endif // ONEFLOW_USER_KERNELS_RADIX_SORT_HIP_H_ \ No newline at end of file diff --git a/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp b/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp index f1ea0e3..073f8e8 100644 --- a/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp +++ b/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp @@ -1,145 +1,145 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
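A note on the Infer*/Sort* helpers in radix_sort.hip.h above: they follow hipCUB's two-phase convention. The first call passes a null d_temp_storage pointer, so hipCUB only writes the required workspace size into temp_storage_bytes; the second call, with a real pointer, performs the sort. The kernels below pre-compute that size in SetInferTmpSizeFn and carve the workspace out of their tmp_buffer instead of allocating at run time. The following is a minimal, self-contained sketch of the same pattern with an ad-hoc hipMalloc; SortRowsDescendingDemo and RowOffsetFunctor are illustrative names, not part of this patch, and it assumes a ROCm toolchain with hipCUB available.

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// Maps segment index i to its begin offset i * num_col, so per-row segment boundaries
// never have to be materialized in device memory.
struct RowOffsetFunctor {
  explicit RowOffsetFunctor(int num_col) : num_col_(num_col) {}
  __host__ __device__ int operator()(int i) const { return i * num_col_; }
  int num_col_;
};

template<typename KeyType>
hipError_t SortRowsDescendingDemo(const KeyType* keys_in, KeyType* keys_out, int num_row,
                                  int num_col, hipStream_t stream) {
  hipcub::CountingInputIterator<int> counting(0);
  RowOffsetFunctor to_offset(num_col);
  hipcub::TransformInputIterator<int, RowOffsetFunctor, hipcub::CountingInputIterator<int>>
      offsets(counting, to_offset);

  size_t temp_bytes = 0;
  // Phase 1: null workspace pointer -> hipCUB only computes the required byte count.
  hipError_t err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending(
      nullptr, temp_bytes, keys_in, keys_out, num_row * num_col, num_row, offsets, offsets + 1,
      0, sizeof(KeyType) * 8, stream);
  if (err != hipSuccess) { return err; }

  void* workspace = nullptr;
  err = hipMalloc(&workspace, temp_bytes);
  if (err != hipSuccess) { return err; }

  // Phase 2: the identical call with a real workspace performs the per-row descending sort.
  err = hipcub::DeviceSegmentedRadixSort::SortKeysDescending(
      workspace, temp_bytes, keys_in, keys_out, num_row * num_col, num_row, offsets, offsets + 1,
      0, sizeof(KeyType) * 8, stream);
  hipFree(workspace);
  return err;
}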
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -class TmpBufferManager final { - public: - OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); - TmpBufferManager(int64_t capacity, void* ptr, const ShapeView& in_shape) - : capacity_{capacity}, - sorted_in_elem_cnt_{in_shape.elem_cnt()}, - indices_elem_cnt_{sorted_in_elem_cnt_}, - sorted_indices_elem_cnt_{sorted_in_elem_cnt_} { - const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); - const int64_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); - const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; - sorted_in_ptr_ = reinterpret_cast(ptr); - indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) - + sorted_in_aligned_bytes); - sorted_indices_ptr_ = - reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); - temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(sorted_indices_ptr_) - + sorted_indices_aligned_bytes); - temp_storage_bytes_ = - capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes - sorted_indices_aligned_bytes; - CHECK_GE(temp_storage_bytes_, 0); - } - ~TmpBufferManager() = default; - - T* SortedInPtr() const { return sorted_in_ptr_; } - int64_t* IndicesPtr() const { return indices_ptr_; } - int64_t* SortedIndicesPtr() const { return sorted_indices_ptr_; } - void* TempStoragePtr() const { return temp_storage_ptr_; } - - int64_t TempStorageBytes() const { return temp_storage_bytes_; } - - private: - int64_t capacity_; - - T* sorted_in_ptr_; - int64_t* indices_ptr_; - int64_t* sorted_indices_ptr_; - void* temp_storage_ptr_; - - int64_t sorted_in_elem_cnt_; - int64_t indices_elem_cnt_; - int64_t sorted_indices_elem_cnt_; - int64_t temp_storage_bytes_; -}; - -__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; -} - -} // namespace - -template -class GpuRadixSortTopKKernel final : public user_op::OpKernel { - public: - GpuRadixSortTopKKernel() = default; - ~GpuRadixSortTopKKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - if (in->shape_view().elem_cnt() == 0) { return; } - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), - tmp_buffer->mut_dptr(), in->shape_view()); - - const int64_t elem_cnt = in->shape_view().elem_cnt(); - const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int64_t instance_num = elem_cnt / instance_size; - const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); - InitializeIndices<<stream()->As()->cuda_stream()>>>( - elem_cnt, buf_manager.IndicesPtr(), instance_size); - SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, - buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), - buf_manager.SortedInPtr(), buf_manager.SortedIndicesPtr(), - ctx->stream()->As()->cuda_stream()); - OF_CUDA_CHECK(hipMemcpy2DAsync(out->mut_dptr(), k * sizeof(int64_t), - buf_manager.SortedIndicesPtr(), 
instance_size * sizeof(int64_t), - k * sizeof(int64_t), instance_num, hipMemcpyDefault, - ctx->stream()->As()->cuda_stream())); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(dtype) \ - REGISTER_USER_KERNEL("top_k") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobAttr("k") > 128) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int64_t elem_cnt = in_shape.elem_cnt(); \ - const int64_t instance_size = in_shape.dim_vec().back(); \ - const int64_t instance_num = elem_cnt / instance_size; \ - \ - /* Sorted In*/ \ - const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ - /* Indices */ \ - const int64_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int64_t)); \ - /* Sorted Indices */ \ - const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; \ - /* CUB Temp Storage */ \ - int64_t temp_storage_bytes = \ - InferTempStorageForSortPairsDescending(instance_num, instance_size); \ - \ - return sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes \ - + temp_storage_bytes; \ - }); - -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(float) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(double) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(uint8_t) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int8_t) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int32_t) -REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
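The top_k kernel above (registered only for k > 128) does not select at all: it fully sorts every row in one segmented descending sort of (value, index) pairs and then gathers the leading k indices of each row with a single strided hipMemcpy2DAsync. In that call the source pitch is instance_size * sizeof(int64_t) (one full sorted row), the destination pitch and the copied width are both k * sizeof(int64_t), and the height is instance_num, so the first k columns of the sorted-indices matrix land contiguously in out. A host-side loop equivalent to that geometry, shown only to make the pitch arithmetic concrete (with instance_size = 1000 and k = 200 the pitches would be 8000 and 1600 bytes):

// Keep the first k of each row of instance_num rows of instance_size sorted indices.
for (int64_t r = 0; r < instance_num; ++r) {
  for (int64_t c = 0; c < k; ++c) { out[r * k + c] = sorted_indices[r * instance_size + c]; }
}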
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +class TmpBufferManager final { + public: + OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager); + TmpBufferManager(int64_t capacity, void* ptr, const ShapeView& in_shape) + : capacity_{capacity}, + sorted_in_elem_cnt_{in_shape.elem_cnt()}, + indices_elem_cnt_{sorted_in_elem_cnt_}, + sorted_indices_elem_cnt_{sorted_in_elem_cnt_} { + const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); + const int64_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); + const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; + sorted_in_ptr_ = reinterpret_cast(ptr); + indices_ptr_ = reinterpret_cast(reinterpret_cast(sorted_in_ptr_) + + sorted_in_aligned_bytes); + sorted_indices_ptr_ = + reinterpret_cast(reinterpret_cast(indices_ptr_) + indices_aligned_bytes); + temp_storage_ptr_ = reinterpret_cast(reinterpret_cast(sorted_indices_ptr_) + + sorted_indices_aligned_bytes); + temp_storage_bytes_ = + capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes - sorted_indices_aligned_bytes; + CHECK_GE(temp_storage_bytes_, 0); + } + ~TmpBufferManager() = default; + + T* SortedInPtr() const { return sorted_in_ptr_; } + int64_t* IndicesPtr() const { return indices_ptr_; } + int64_t* SortedIndicesPtr() const { return sorted_indices_ptr_; } + void* TempStoragePtr() const { return temp_storage_ptr_; } + + int64_t TempStorageBytes() const { return temp_storage_bytes_; } + + private: + int64_t capacity_; + + T* sorted_in_ptr_; + int64_t* indices_ptr_; + int64_t* sorted_indices_ptr_; + void* temp_storage_ptr_; + + int64_t sorted_in_elem_cnt_; + int64_t indices_elem_cnt_; + int64_t sorted_indices_elem_cnt_; + int64_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices(int64_t elem_cnt, int64_t* indices_ptr, int64_t instance_size) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }; +} + +} // namespace + +template +class GpuRadixSortTopKKernel final : public user_op::OpKernel { + public: + GpuRadixSortTopKKernel() = default; + ~GpuRadixSortTopKKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + if (in->shape_view().elem_cnt() == 0) { return; } + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + TmpBufferManager buf_manager(static_cast(tmp_buffer->shape_view().elem_cnt()), + tmp_buffer->mut_dptr(), in->shape_view()); + + const int64_t elem_cnt = in->shape_view().elem_cnt(); + const int64_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int64_t instance_num = elem_cnt / instance_size; + const int64_t k = std::min(static_cast(ctx->Attr("k")), instance_size); + InitializeIndices<<stream()->As()->cuda_stream()>>>( + elem_cnt, buf_manager.IndicesPtr(), instance_size); + SortPairsDescending(in->dptr(), buf_manager.IndicesPtr(), instance_num, instance_size, + buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), buf_manager.SortedIndicesPtr(), + ctx->stream()->As()->cuda_stream()); + OF_CUDA_CHECK(hipMemcpy2DAsync(out->mut_dptr(), k * sizeof(int64_t), + buf_manager.SortedIndicesPtr(), 
instance_size * sizeof(int64_t), + k * sizeof(int64_t), instance_num, hipMemcpyDefault, + ctx->stream()->As()->cuda_stream())); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(dtype) \ + REGISTER_USER_KERNEL("top_k") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobAttr("k") > 128) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int64_t elem_cnt = in_shape.elem_cnt(); \ + const int64_t instance_size = in_shape.dim_vec().back(); \ + const int64_t instance_num = elem_cnt / instance_size; \ + \ + /* Sorted In*/ \ + const int64_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \ + /* Indices */ \ + const int64_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int64_t)); \ + /* Sorted Indices */ \ + const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; \ + /* CUB Temp Storage */ \ + int64_t temp_storage_bytes = \ + InferTempStorageForSortPairsDescending(instance_num, instance_size); \ + \ + return sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes \ + + temp_storage_bytes; \ + }); + +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(float) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(double) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(uint8_t) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int8_t) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int32_t) +REGISTER_CUDA_RADIX_SORT_TOP_K_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/random_mask_generator.hip.cpp b/oneflow/user/kernels/random_mask_generator.hip.cpp index c28b7e3..6408b5b 100644 --- a/oneflow/user/kernels/random_mask_generator.hip.cpp +++ b/oneflow/user/kernels/random_mask_generator.hip.cpp @@ -1,69 +1,69 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
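TmpBufferManager above and the SetInferTmpSizeFn in the registration are two views of the same plan for the single tmp_buffer: four regions laid out back to back, each rounded up by GetCudaAlignedSize, with whatever remains handed to hipCUB as its workspace (the CHECK_GE guards against that remainder going negative). As a worked example, assuming a float input of shape (64, 1024), elem_cnt = 65536 and the layout is

  | sorted_in: 65536 * 4 B = 262144 | indices: 65536 * 8 B = 524288 | sorted_indices: 524288 | hipCUB temp storage: InferTempStorageForSortPairsDescending(64, 1024) |

The inferred total is exactly what SetInferTmpSizeFn returns, which is why the runtime CHECK_LE inside SortPairsDescending holds.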
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/random_mask_generator.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr int32_t kMinPackPerThread = 2; - -using PackType = ulonglong2; - -union Pack { - PackType p_value; - bool b_value[sizeof(PackType)]; -}; - -__device__ bool GenMask(hiprandState* state, const float rate) { - return hiprand_uniform(state) > rate; -} - -__global__ void GenerateGpu(hiprandState* state, const int64_t n, const float rate, bool* mask) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState localState = state[id]; - PackType* pack_mask = reinterpret_cast(mask); - Pack pack; - CUDA_1D_KERNEL_LOOP(i, n / sizeof(PackType)) { -#pragma unroll - for (int j = 0; j < sizeof(PackType); ++j) { pack.b_value[j] = GenMask(&localState, rate); } - pack_mask[i] = pack.p_value; - } - const int32_t rem_cnt = n % sizeof(PackType); - const int32_t rem_offset = n - rem_cnt; - if (id < rem_cnt) { mask[id + rem_offset] = GenMask(&localState, rate); } - state[id] = localState; -} - -} // namespace - -void RandomMaskGenerator::Generate(ep::Stream* stream, const int64_t n, - const float rate, bool* mask) { - int32_t block_num = generator_->max_block_num(); - int32_t thread_num = generator_->max_thread_num(); - auto* curand_states = generator_->curand_states(); - const int32_t elem_cnt_per_block = thread_num * sizeof(PackType) * kMinPackPerThread; - const int32_t block_num_final = - std::min(static_cast((n + elem_cnt_per_block - 1) / elem_cnt_per_block), block_num); - GenerateGpu<<As()->cuda_stream()>>>( - curand_states, n, rate, mask); -} - -template class RandomMaskGenerator; - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/random_mask_generator.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr int32_t kMinPackPerThread = 2; + +using PackType = ulonglong2; + +union Pack { + PackType p_value; + bool b_value[sizeof(PackType)]; +}; + +__device__ bool GenMask(hiprandState* state, const float rate) { + return hiprand_uniform(state) > rate; +} + +__global__ void GenerateGpu(hiprandState* state, const int64_t n, const float rate, bool* mask) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState localState = state[id]; + PackType* pack_mask = reinterpret_cast(mask); + Pack pack; + CUDA_1D_KERNEL_LOOP(i, n / sizeof(PackType)) { +#pragma unroll + for (int j = 0; j < sizeof(PackType); ++j) { pack.b_value[j] = GenMask(&localState, rate); } + pack_mask[i] = pack.p_value; + } + const int32_t rem_cnt = n % sizeof(PackType); + const int32_t rem_offset = n - rem_cnt; + if (id < rem_cnt) { mask[id + rem_offset] = GenMask(&localState, rate); } + state[id] = localState; +} + +} // namespace + +void RandomMaskGenerator::Generate(ep::Stream* stream, const int64_t n, + const float rate, bool* mask) { + int32_t block_num = generator_->max_block_num(); + int32_t thread_num = generator_->max_thread_num(); + auto* curand_states = generator_->curand_states(); + const int32_t elem_cnt_per_block = thread_num * sizeof(PackType) * kMinPackPerThread; + const int32_t block_num_final = + std::min(static_cast((n + elem_cnt_per_block - 1) / elem_cnt_per_block), block_num); + GenerateGpu<<As()->cuda_stream()>>>( + curand_states, n, rate, mask); +} + +template class RandomMaskGenerator; + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/randperm_kernel.hip.cpp b/oneflow/user/kernels/randperm_kernel.hip.cpp index 94dcf6c..611ba57 100644 --- a/oneflow/user/kernels/randperm_kernel.hip.cpp +++ b/oneflow/user/kernels/randperm_kernel.hip.cpp @@ -1,201 +1,201 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include -#include -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/ep/include/stream.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/framework/random_generator.h" -#include "oneflow/user/kernels/op_kernel_wrapper.h" -#include "oneflow/user/kernels/arange_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/user/kernels/distributions/common.h" -#include "oneflow/core/ep/include/device.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/job/nd_sbp_util.h" -#include "oneflow/core/common/container_util.h" -#include "oneflow/core/register/tensor_slice_view.h" -#include "oneflow/core/device/cuda_util.h" - -namespace oneflow { -__global__ void GeneKeysAndValues(const int32_t n, int32_t* values, int32_t* keys, - hiprandState* state) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - hiprandState local_state = state[id]; - CUDA_1D_KERNEL_LOOP(i, n) { - keys[i] = hiprand(&local_state); - values[i] = i; - } - state[id] = local_state; -} - -__global__ void tempcopy2output(const int32_t n, const int32_t offset, int32_t* temp, - int32_t* output) { - CUDA_1D_KERNEL_LOOP(i, n) { output[i] = temp[offset + i]; } -} -class GpuRandPermKernelCache final : public user_op::OpKernelCache { - public: - GpuRandPermKernelCache(int32_t lower, int32_t upper) : lower_(lower), upper_(upper) {} - ~GpuRandPermKernelCache() override = default; - - int32_t lower() const { return lower_; } - int32_t upper() const { return upper_; } - - private: - const int32_t lower_; - const int32_t upper_; -}; - -namespace { - -template -size_t GetCubSortPairsTempStorageSize(int64_t n) { - size_t cub_sort_temp_store_size = 0; - OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, - nullptr, nullptr, nullptr, n))); - size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); - CHECK_GE(temp_store_size, 0) << "temp_store_size should >= 0."; - CHECK_LT(temp_store_size, static_cast(GetMaxVal())) - << "temp_store_size should < " << static_cast(GetMaxVal()); - return temp_store_size; -} - -} // namespace - -class GpuRandPermKernel final : public user_op::OpKernel { - public: - GpuRandPermKernel() = default; - ~GpuRandPermKernel() = default; - std::shared_ptr InitOpKernelCache( - user_op::KernelCacheContext* ctx) const override { - int64_t parallel_num = ctx->parallel_ctx().parallel_num(); - if (parallel_num > 1) { - const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); - const Shape& hierarchy = *ctx->parallel_desc().hierarchy(); - int64_t parallel_id = ctx->parallel_ctx().parallel_id(); - int32_t n = ctx->Attr("n"); - const Shape& logical_shape = Shape({n}); - TensorSliceView view = - GetTensorSliceView4ParallelId(hierarchy, nd_sbp, logical_shape, parallel_id); - std::shared_ptr cache( - new GpuRandPermKernelCache(view.At(0).begin(), view.At(0).end())); - return cache; - } else { - return nullptr; - } - } - std::shared_ptr CreateOpKernelState( - user_op::KernelInitContext* ctx) const override { - const auto& generator = CHECK_JUST(one::MakeGenerator(kCUDA)); - generator->set_current_seed(ctx->Attr("seed")); - return std::make_shared(generator); - } - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const override { - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - int32_t* output = out->mut_dptr(); - const int32_t 
n = ctx->Attr("n"); - if (n == 0) { return; } - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - auto* distribution_state = dynamic_cast(state); - CHECK_NOTNULL(distribution_state); - const auto& generator = distribution_state->generator(); - auto* stream = ctx->stream(); - const auto device_index = stream->device()->device_index(); - const auto& gpu_generator = CHECK_JUST(generator->Get(device_index)); - CHECK_NOTNULL(generator); - - int32_t block_num = gpu_generator->max_block_num(); - int32_t thread_num = gpu_generator->max_thread_num(); - hiprandState* curand_states = gpu_generator->curand_states(); - - // layout for tmp |...key(in and out,2xN)..|....value....|.... space for sort function....| - // values are the desired indexes ,and keys are generated randomly. - void* tmp = tmp_buffer->mut_dptr(); - int32_t* key_base = reinterpret_cast(tmp); - - const int32_t key_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - int32_t* value_base = - reinterpret_cast(reinterpret_cast(key_base) + 2 * key_aligned_bytes); - const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - int32_t* temp_buffer_base = - reinterpret_cast(reinterpret_cast(value_base) + indices_aligned_bytes); - const int32_t temp_buffer_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - - void* tmp_base = reinterpret_cast(reinterpret_cast(temp_buffer_base) - + temp_buffer_aligned_bytes); - size_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); - GeneKeysAndValues<<As()->cuda_stream()>>>( - n, value_base, key_base, curand_states); - if (cache == nullptr) { - auto err = hipcub::DeviceRadixSort::SortPairs( - /* d_temp_storage */ tmp_base, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ key_base, - /* d_keys_out */ key_base + n, - /* d_values_in */ value_base, - /* d_values_out */ output, - /* num_items */ n, - /* begin_bit */ 0, - /* end_bit */ sizeof(int32_t) * 8, - /* stream */ ctx->stream()->As()->cuda_stream()); - OF_CUDA_CHECK(err); - } else { - auto err = hipcub::DeviceRadixSort::SortPairs( - /* d_temp_storage */ tmp_base, - /* temp_storage_bytes */ temp_storage_bytes, - /* d_keys_in */ key_base, - /* d_keys_out */ key_base + n, - /* d_values_in */ value_base, - /* d_values_out */ temp_buffer_base, - /* num_items */ n, - /* begin_bit */ 0, - /* end_bit */ sizeof(int32_t) * 8, - /* stream */ ctx->stream()->As()->cuda_stream()); - OF_CUDA_CHECK(err); - const auto* randperm_cache = dynamic_cast(cache); - auto len = randperm_cache->upper() - randperm_cache->lower(); - const int64_t offset = randperm_cache->lower(); - tempcopy2output<<stream()->As()->cuda_stream()>>>( - len, offset, temp_buffer_base, output); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; -REGISTER_USER_KERNEL("randperm") - .SetCreateFn() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { - const int32_t n = ctx->Attr("n"); - /* Sorted In */ - const int32_t sorted_in_aligned_bytes = 2 * GetCudaAlignedSize(n * sizeof(int32_t)); - /* Indices */ - const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - const int32_t temp_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); - - /* CUB Temp Storage */ - const int32_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); - - return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes - + temp_aligned_bytes; - }); +/* +Copyright 2020 The OneFlow Authors. All rights reserved. 
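GpuRandPermKernel above produces a permutation by sorting rather than by an in-place Fisher-Yates shuffle: GeneKeysAndValues fills keys with hiprand() draws and values with 0..n-1, and hipcub::DeviceRadixSort::SortPairs keyed on those random numbers leaves a (nearly) uniform permutation in its values output; duplicate keys only affect the relative order of the colliding elements. The tmp layout in the comment, | keys in and out (2N) | values (N) | temp output (N) | CUB workspace |, lets the sort run out-of-place, and in the multi-rank branch the full permutation is first written to the temp output so tempcopy2output can copy only the [lower, upper) slice this rank owns, which stays globally consistent as long as every rank uses the same seed. A host-side analogue of the shuffle-by-sort step, offered as a minimal sketch (RandPermBySort is an illustrative name, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

std::vector<int32_t> RandPermBySort(int32_t n, uint64_t seed) {
  std::mt19937_64 rng(seed);
  std::vector<uint32_t> keys(n);
  for (auto& key : keys) { key = static_cast<uint32_t>(rng()); }  // random sort keys
  std::vector<int32_t> perm(n);
  std::iota(perm.begin(), perm.end(), 0);  // the "values": 0, 1, ..., n - 1
  // Sorting the values by their random keys mirrors DeviceRadixSort::SortPairs(keys, values).
  std::stable_sort(perm.begin(), perm.end(),
                   [&](int32_t a, int32_t b) { return keys[a] < keys[b]; });
  return perm;
}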
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#include +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/ep/include/stream.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/framework/random_generator.h" +#include "oneflow/user/kernels/op_kernel_wrapper.h" +#include "oneflow/user/kernels/arange_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/user/kernels/distributions/common.h" +#include "oneflow/core/ep/include/device.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/job/nd_sbp_util.h" +#include "oneflow/core/common/container_util.h" +#include "oneflow/core/register/tensor_slice_view.h" +#include "oneflow/core/device/cuda_util.h" + +namespace oneflow { +__global__ void GeneKeysAndValues(const int32_t n, int32_t* values, int32_t* keys, + hiprandState* state) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + hiprandState local_state = state[id]; + CUDA_1D_KERNEL_LOOP(i, n) { + keys[i] = hiprand(&local_state); + values[i] = i; + } + state[id] = local_state; +} + +__global__ void tempcopy2output(const int32_t n, const int32_t offset, int32_t* temp, + int32_t* output) { + CUDA_1D_KERNEL_LOOP(i, n) { output[i] = temp[offset + i]; } +} +class GpuRandPermKernelCache final : public user_op::OpKernelCache { + public: + GpuRandPermKernelCache(int32_t lower, int32_t upper) : lower_(lower), upper_(upper) {} + ~GpuRandPermKernelCache() override = default; + + int32_t lower() const { return lower_; } + int32_t upper() const { return upper_; } + + private: + const int32_t lower_; + const int32_t upper_; +}; + +namespace { + +template +size_t GetCubSortPairsTempStorageSize(int64_t n) { + size_t cub_sort_temp_store_size = 0; + OF_CUDA_CHECK((hipcub::DeviceRadixSort::SortPairs(nullptr, cub_sort_temp_store_size, nullptr, + nullptr, nullptr, nullptr, n))); + size_t temp_store_size = GetCudaAlignedSize(cub_sort_temp_store_size); + CHECK_GE(temp_store_size, 0) << "temp_store_size should >= 0."; + CHECK_LT(temp_store_size, static_cast(GetMaxVal())) + << "temp_store_size should < " << static_cast(GetMaxVal()); + return temp_store_size; +} + +} // namespace + +class GpuRandPermKernel final : public user_op::OpKernel { + public: + GpuRandPermKernel() = default; + ~GpuRandPermKernel() = default; + std::shared_ptr InitOpKernelCache( + user_op::KernelCacheContext* ctx) const override { + int64_t parallel_num = ctx->parallel_ctx().parallel_num(); + if (parallel_num > 1) { + const NdSbp& nd_sbp = ctx->NdSbp4ArgNameAndIndex("out", 0); + const Shape& hierarchy = *ctx->parallel_desc().hierarchy(); + int64_t parallel_id = ctx->parallel_ctx().parallel_id(); + int32_t n = ctx->Attr("n"); + const Shape& logical_shape = Shape({n}); + TensorSliceView view = + GetTensorSliceView4ParallelId(hierarchy, nd_sbp, logical_shape, parallel_id); + std::shared_ptr cache( + new GpuRandPermKernelCache(view.At(0).begin(), view.At(0).end())); + return cache; + } else { + return nullptr; + } + } + std::shared_ptr 
CreateOpKernelState( + user_op::KernelInitContext* ctx) const override { + const auto& generator = CHECK_JUST(one::MakeGenerator(kCUDA)); + generator->set_current_seed(ctx->Attr("seed")); + return std::make_shared(generator); + } + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const override { + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + int32_t* output = out->mut_dptr(); + const int32_t n = ctx->Attr("n"); + if (n == 0) { return; } + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + auto* distribution_state = dynamic_cast(state); + CHECK_NOTNULL(distribution_state); + const auto& generator = distribution_state->generator(); + auto* stream = ctx->stream(); + const auto device_index = stream->device()->device_index(); + const auto& gpu_generator = CHECK_JUST(generator->Get(device_index)); + CHECK_NOTNULL(generator); + + int32_t block_num = gpu_generator->max_block_num(); + int32_t thread_num = gpu_generator->max_thread_num(); + hiprandState* curand_states = gpu_generator->curand_states(); + + // layout for tmp |...key(in and out,2xN)..|....value....|.... space for sort function....| + // values are the desired indexes ,and keys are generated randomly. + void* tmp = tmp_buffer->mut_dptr(); + int32_t* key_base = reinterpret_cast(tmp); + + const int32_t key_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + int32_t* value_base = + reinterpret_cast(reinterpret_cast(key_base) + 2 * key_aligned_bytes); + const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + int32_t* temp_buffer_base = + reinterpret_cast(reinterpret_cast(value_base) + indices_aligned_bytes); + const int32_t temp_buffer_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + + void* tmp_base = reinterpret_cast(reinterpret_cast(temp_buffer_base) + + temp_buffer_aligned_bytes); + size_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); + GeneKeysAndValues<<As()->cuda_stream()>>>( + n, value_base, key_base, curand_states); + if (cache == nullptr) { + auto err = hipcub::DeviceRadixSort::SortPairs( + /* d_temp_storage */ tmp_base, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ key_base, + /* d_keys_out */ key_base + n, + /* d_values_in */ value_base, + /* d_values_out */ output, + /* num_items */ n, + /* begin_bit */ 0, + /* end_bit */ sizeof(int32_t) * 8, + /* stream */ ctx->stream()->As()->cuda_stream()); + OF_CUDA_CHECK(err); + } else { + auto err = hipcub::DeviceRadixSort::SortPairs( + /* d_temp_storage */ tmp_base, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ key_base, + /* d_keys_out */ key_base + n, + /* d_values_in */ value_base, + /* d_values_out */ temp_buffer_base, + /* num_items */ n, + /* begin_bit */ 0, + /* end_bit */ sizeof(int32_t) * 8, + /* stream */ ctx->stream()->As()->cuda_stream()); + OF_CUDA_CHECK(err); + const auto* randperm_cache = dynamic_cast(cache); + auto len = randperm_cache->upper() - randperm_cache->lower(); + const int64_t offset = randperm_cache->lower(); + tempcopy2output<<stream()->As()->cuda_stream()>>>( + len, offset, temp_buffer_base, output); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; +REGISTER_USER_KERNEL("randperm") + .SetCreateFn() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA) + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { + const int32_t n = ctx->Attr("n"); + /* Sorted In 
*/ + const int32_t sorted_in_aligned_bytes = 2 * GetCudaAlignedSize(n * sizeof(int32_t)); + /* Indices */ + const int32_t indices_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + const int32_t temp_aligned_bytes = GetCudaAlignedSize(n * sizeof(int32_t)); + + /* CUB Temp Storage */ + const int32_t temp_storage_bytes = GetCubSortPairsTempStorageSize(n); + + return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes + + temp_aligned_bytes; + }); } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp b/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp index 3b3d3dc..f4c1d22 100644 --- a/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp +++ b/oneflow/user/kernels/repeat_interleave_kernel.hip.cpp @@ -1,73 +1,73 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/user/kernels/roll_kernel_utils.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -#include - -namespace oneflow { - -namespace { - -template -__global__ void repeat_interleave(const T* in_ptr, const T* cumsum_ptr, T* out_ptr, - const int64_t num) { - CUDA_1D_KERNEL_LOOP(i, num) { - T end = cumsum_ptr[i]; - T size = in_ptr[i]; - T start = end - size; - for (T j = start; j < end; j++) { out_ptr[j] = i; } - } -} - -} // namespace - -template -class GpuRepeatInterLeaveKernel final : public user_op::OpKernel { - public: - GpuRepeatInterLeaveKernel() = default; - ~GpuRepeatInterLeaveKernel() = default; - - private: - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - const user_op::Tensor* cumsum = ctx->Tensor4ArgNameAndIndex("cumsum", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const int64_t& repeat_num = ctx->Attr("repeat_num"); - const T* in_ptr = in->dptr(); - const T* cumsum_ptr = cumsum->dptr(); - T* out_ptr = out->mut_dptr(); - - repeat_interleave<<shape_view().At(0)), kCudaThreadsNumPerBlock, 0, - ctx->stream()->As()->cuda_stream()>>>( - in_ptr, cumsum_ptr, out_ptr, in->shape_view().At(0)); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_REPEAT_INTER_LEAVE_KERNEL(dtype) \ - REGISTER_USER_KERNEL("repeat_interleave") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_REPEAT_INTER_LEAVE_KERNEL(int32_t); -REGISTER_REPEAT_INTER_LEAVE_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/user/kernels/roll_kernel_utils.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +#include + +namespace oneflow { + +namespace { + +template +__global__ void repeat_interleave(const T* in_ptr, const T* cumsum_ptr, T* out_ptr, + const int64_t num) { + CUDA_1D_KERNEL_LOOP(i, num) { + T end = cumsum_ptr[i]; + T size = in_ptr[i]; + T start = end - size; + for (T j = start; j < end; j++) { out_ptr[j] = i; } + } +} + +} // namespace + +template +class GpuRepeatInterLeaveKernel final : public user_op::OpKernel { + public: + GpuRepeatInterLeaveKernel() = default; + ~GpuRepeatInterLeaveKernel() = default; + + private: + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + const user_op::Tensor* cumsum = ctx->Tensor4ArgNameAndIndex("cumsum", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const int64_t& repeat_num = ctx->Attr("repeat_num"); + const T* in_ptr = in->dptr(); + const T* cumsum_ptr = cumsum->dptr(); + T* out_ptr = out->mut_dptr(); + + repeat_interleave<<shape_view().At(0)), kCudaThreadsNumPerBlock, 0, + ctx->stream()->As()->cuda_stream()>>>( + in_ptr, cumsum_ptr, out_ptr, in->shape_view().At(0)); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_REPEAT_INTER_LEAVE_KERNEL(dtype) \ + REGISTER_USER_KERNEL("repeat_interleave") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_REPEAT_INTER_LEAVE_KERNEL(int32_t); +REGISTER_REPEAT_INTER_LEAVE_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/roi_align_kernel.hip.cpp b/oneflow/user/kernels/roi_align_kernel.hip.cpp index f9492c6..a6f3608 100644 --- a/oneflow/user/kernels/roi_align_kernel.hip.cpp +++ b/oneflow/user/kernels/roi_align_kernel.hip.cpp @@ -1,302 +1,302 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
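The repeat_interleave kernel above takes the repeat counts and their inclusive cumulative sum and lets thread i fill its own half-open output range [cumsum[i] - in[i], cumsum[i]) with the value i, so no atomics or extra synchronization are needed. A host analogue with a concrete case, for illustration only: in = {2, 0, 3} and cumsum = {2, 2, 5} produce out = {0, 0, 2, 2, 2}.

// num is the length of in/cumsum; out holds cumsum[num - 1] elements.
for (int64_t i = 0; i < num; ++i) {
  for (int64_t j = cumsum[i] - in[i]; j < cumsum[i]; ++j) { out[j] = i; }
}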
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__device__ T BilinearInterpolate(const T* channel_dptr, const int32_t height, const int32_t width, - T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { return 0; } - - if (y <= 0) { y = 0; } - if (x <= 0) { x = 0; } - int32_t y_low = static_cast(y); - int32_t x_low = static_cast(x); - int32_t y_high = 0; - int32_t x_high = 0; - - if (y_low >= height - 1) { - y_low = height - 1; - y_high = y_low; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_low = width - 1; - x_high = x_low; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - const T ly = y - y_low; - const T lx = x - x_low; - const T hy = 1.f - ly; - const T hx = 1.f - lx; - - // https://en.wikipedia.org/wiki/Bilinear_interpolation - const int64_t q11 = y_low * width + x_low; - const int64_t q21 = y_low * width + x_high; - const int64_t q12 = y_high * width + x_low; - const int64_t q22 = y_high * width + x_high; - // no 1 / (x_high - x_low) * (y_high - y_low) because it will always be 1 in RoI Align - return (hy * hx) * channel_dptr[q11] + (hy * lx) * channel_dptr[q21] - + (ly * hx) * channel_dptr[q12] + (ly * lx) * channel_dptr[q22]; -} - -template -__device__ bool BilinearInterpolateDiff(const T bin_diff_avg, const int64_t height, - const int64_t width, T y, T x, T& diff11, T& diff21, - T& diff12, T& diff22, int32_t& x_low, int32_t& x_high, - int32_t& y_low, int32_t& y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { return false; } - - if (y <= 0) { y = 0; } - if (x <= 0) { x = 0; } - - y_low = static_cast(y); - x_low = static_cast(x); - - if (y_low >= height - 1) { - y_low = height - 1; - y_high = y_low; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_low = width - 1; - x_high = x_low; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - const T ly = y - y_low; - const T lx = x - x_low; - const T hy = 1.f - ly; - const T hx = 1.f - lx; - - diff11 = bin_diff_avg * hy * hx; - diff21 = bin_diff_avg * hy * lx; - diff12 = bin_diff_avg * ly * hx; - diff22 = bin_diff_avg * ly * lx; - return true; -} - -template -__global__ void RoiAlignForward(const int64_t nthreads, const T* in_dptr, const T* rois_dptr, - const T spatial_scale, const int32_t sampling_ratio, - const int64_t channel_num, const int64_t height, - const int64_t width, const int64_t pooled_height, - const int64_t pooled_width, const bool aligned, T* out_dptr) { - const int64_t pooled_area = pooled_height * pooled_width; - const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int64_t h = (index / pooled_width) % pooled_height; - const int64_t w = index % pooled_width; - const int64_t c = (index / pooled_area) % channel_num; - const int64_t r = index / channel_pooled_area; - const T* offset_rois_dptr = rois_dptr + r * 5; - const int64_t n = static_cast(offset_rois_dptr[0]); - const T align_offset = aligned ? 
static_cast(0.5) : static_cast(0.f); - const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; - const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; - const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; - const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; - T roi_height = roi_end_h - roi_start_h; - T roi_width = roi_end_w - roi_start_w; - // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of - // determining minimum roi size - if (aligned == false) { - roi_height = max(roi_height, static_cast(1.0)); - roi_width = max(roi_width, static_cast(1.0)); - } - const T bin_height = static_cast(roi_height) / static_cast(pooled_height); - const T bin_width = static_cast(roi_width) / static_cast(pooled_width); - const int32_t bin_grid_height = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); - const int32_t bin_grid_width = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(bin_grid_height * bin_grid_width, 1); - const T* channel_dptr = in_dptr + (n * channel_num + c) * height * width; - T out_val = 0.0; - FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { - // + .5f for center position - T y = roi_start_h + h * bin_height - + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); - FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { - T x = roi_start_w + w * bin_width - + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); - out_val += BilinearInterpolate(channel_dptr, height, width, y, x); - } - } - out_dptr[index] = out_val / count; - } -} - -template -__global__ void RoiAlignBackward(const int64_t nthreads, const T* out_diff_dptr, const T* rois_dptr, - const T spatial_scale, const int32_t sampling_ratio, - const int64_t channel_num, const int64_t height, - const int64_t width, const int64_t pooled_height, - const int64_t pooled_width, const bool aligned, T* in_diff_dptr) { - const int64_t pooled_area = pooled_height * pooled_width; - const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int64_t h = (index / pooled_width) % pooled_height; - const int64_t w = index % pooled_width; - const int64_t c = (index / pooled_area) % channel_num; - const int64_t r = index / channel_pooled_area; - const T* offset_rois_dptr = rois_dptr + r * 5; - const int64_t n = static_cast(offset_rois_dptr[0]); - const T align_offset = aligned ? static_cast(0.5) : static_cast(0.f); - const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; - const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; - const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; - const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of - // determining minimum roi size - if (aligned == false) { - roi_height = max(roi_height, static_cast(1.0)); - roi_width = max(roi_width, static_cast(1.0)); - } - const T bin_height = static_cast(roi_height) / static_cast(pooled_height); - const T bin_width = static_cast(roi_width) / static_cast(pooled_width); - const int32_t bin_grid_height = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); - const int32_t bin_grid_width = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - - const T count = max(bin_grid_height * bin_grid_width, 1); - const T bin_diff_avg = out_diff_dptr[index] / count; - T* in_diff_channel_dptr = in_diff_dptr + (n * channel_num + c) * height * width; - FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { - // + .5f for center position - T y = roi_start_h + h * bin_height - + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); - FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { - T x = roi_start_w + w * bin_width - + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); - T diff11 = 0; - T diff21 = 0; - T diff12 = 0; - T diff22 = 0; - int32_t x_low = 0; - int32_t x_high = 0; - int32_t y_low = 0; - int32_t y_high = 0; - bool has_diff = BilinearInterpolateDiff(bin_diff_avg, height, width, y, x, diff11, diff21, - diff12, diff22, x_low, x_high, y_low, y_high); - if (has_diff) { - const int64_t q11 = y_low * width + x_low; - const int64_t q21 = y_low * width + x_high; - const int64_t q12 = y_high * width + x_low; - const int64_t q22 = y_high * width + x_high; - atomicAdd(in_diff_channel_dptr + q11, diff11); - atomicAdd(in_diff_channel_dptr + q21, diff21); - atomicAdd(in_diff_channel_dptr + q12, diff12); - atomicAdd(in_diff_channel_dptr + q22, diff22); - } - } - } - } -} - -} // namespace - -template -class RoIAlignKernel final : public user_op::OpKernel { - public: - RoIAlignKernel() = default; - ~RoIAlignKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_blob = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); - if (rois_blob->shape_view().elem_cnt() == 0) { return; } - user_op::Tensor* y_blob = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t pooled_h = ctx->Attr("pooled_h"); - const int32_t pooled_w = ctx->Attr("pooled_w"); - const float spatial_scale = ctx->Attr("spatial_scale"); - const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); - const bool aligned = ctx->Attr("aligned"); - - const int64_t elem_cnt = y_blob->shape_view().elem_cnt(); - RoiAlignForward<<stream()->As()->cuda_stream()>>>( - elem_cnt, x_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, - x_blob->shape_view().At(1), x_blob->shape_view().At(2), x_blob->shape_view().At(3), - pooled_h, pooled_w, aligned, y_blob->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class RoIAlignGradKernel final : public user_op::OpKernel { - public: - RoIAlignGradKernel() = default; - ~RoIAlignGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); - if (dx_blob == nullptr) { return; } - Memset(ctx->stream(), dx_blob->mut_dptr(), 0, - dx_blob->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); - const int32_t pooled_h = ctx->Attr("pooled_h"); - const int32_t pooled_w = ctx->Attr("pooled_w"); - const float spatial_scale = ctx->Attr("spatial_scale"); - const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); - const bool aligned = ctx->Attr("aligned"); - - const int64_t elem_cnt = dy_blob->shape_view().elem_cnt(); - if (elem_cnt > 0) { - 
RoiAlignBackward<<stream()->As()->cuda_stream()>>>( - elem_cnt, dy_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, - dx_blob->shape_view().At(1), dx_blob->shape_view().At(2), dx_blob->shape_view().At(3), - pooled_h, pooled_w, aligned, dx_blob->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -REGISTER_USER_KERNEL("roi_align") - .SetCreateFn>() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - -REGISTER_USER_KERNEL("roi_align_grad") - .SetCreateFn>() - .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__device__ T BilinearInterpolate(const T* channel_dptr, const int32_t height, const int32_t width, + T y, T x) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { return 0; } + + if (y <= 0) { y = 0; } + if (x <= 0) { x = 0; } + int32_t y_low = static_cast(y); + int32_t x_low = static_cast(x); + int32_t y_high = 0; + int32_t x_high = 0; + + if (y_low >= height - 1) { + y_low = height - 1; + y_high = y_low; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_low = width - 1; + x_high = x_low; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + const T ly = y - y_low; + const T lx = x - x_low; + const T hy = 1.f - ly; + const T hx = 1.f - lx; + + // https://en.wikipedia.org/wiki/Bilinear_interpolation + const int64_t q11 = y_low * width + x_low; + const int64_t q21 = y_low * width + x_high; + const int64_t q12 = y_high * width + x_low; + const int64_t q22 = y_high * width + x_high; + // no 1 / (x_high - x_low) * (y_high - y_low) because it will always be 1 in RoI Align + return (hy * hx) * channel_dptr[q11] + (hy * lx) * channel_dptr[q21] + + (ly * hx) * channel_dptr[q12] + (ly * lx) * channel_dptr[q22]; +} + +template +__device__ bool BilinearInterpolateDiff(const T bin_diff_avg, const int64_t height, + const int64_t width, T y, T x, T& diff11, T& diff21, + T& diff12, T& diff22, int32_t& x_low, int32_t& x_high, + int32_t& y_low, int32_t& y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { return false; } + + if (y <= 0) { y = 0; } + if (x <= 0) { x = 0; } + + y_low = static_cast(y); + x_low = static_cast(x); + + if (y_low >= height - 1) { + y_low = height - 1; + y_high = y_low; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_low = width - 1; + x_high = x_low; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + const T ly = y - y_low; + const T lx = x - x_low; + const T hy = 1.f - ly; + const T hx = 1.f - lx; + + diff11 = bin_diff_avg * hy * hx; + diff21 = bin_diff_avg * hy * lx; + diff12 = bin_diff_avg * ly * hx; + diff22 = bin_diff_avg * 
ly * lx; + return true; +} + +template +__global__ void RoiAlignForward(const int64_t nthreads, const T* in_dptr, const T* rois_dptr, + const T spatial_scale, const int32_t sampling_ratio, + const int64_t channel_num, const int64_t height, + const int64_t width, const int64_t pooled_height, + const int64_t pooled_width, const bool aligned, T* out_dptr) { + const int64_t pooled_area = pooled_height * pooled_width; + const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int64_t h = (index / pooled_width) % pooled_height; + const int64_t w = index % pooled_width; + const int64_t c = (index / pooled_area) % channel_num; + const int64_t r = index / channel_pooled_area; + const T* offset_rois_dptr = rois_dptr + r * 5; + const int64_t n = static_cast(offset_rois_dptr[0]); + const T align_offset = aligned ? static_cast(0.5) : static_cast(0.f); + const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; + const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; + const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; + const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; + T roi_height = roi_end_h - roi_start_h; + T roi_width = roi_end_w - roi_start_w; + // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of + // determining minimum roi size + if (aligned == false) { + roi_height = max(roi_height, static_cast(1.0)); + roi_width = max(roi_width, static_cast(1.0)); + } + const T bin_height = static_cast(roi_height) / static_cast(pooled_height); + const T bin_width = static_cast(roi_width) / static_cast(pooled_width); + const int32_t bin_grid_height = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); + const int32_t bin_grid_width = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + const T count = max(bin_grid_height * bin_grid_width, 1); + const T* channel_dptr = in_dptr + (n * channel_num + c) * height * width; + T out_val = 0.0; + FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { + // + .5f for center position + T y = roi_start_h + h * bin_height + + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); + FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { + T x = roi_start_w + w * bin_width + + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); + out_val += BilinearInterpolate(channel_dptr, height, width, y, x); + } + } + out_dptr[index] = out_val / count; + } +} + +template +__global__ void RoiAlignBackward(const int64_t nthreads, const T* out_diff_dptr, const T* rois_dptr, + const T spatial_scale, const int32_t sampling_ratio, + const int64_t channel_num, const int64_t height, + const int64_t width, const int64_t pooled_height, + const int64_t pooled_width, const bool aligned, T* in_diff_dptr) { + const int64_t pooled_area = pooled_height * pooled_width; + const int64_t channel_pooled_area = channel_num * pooled_height * pooled_width; + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int64_t h = (index / pooled_width) % pooled_height; + const int64_t w = index % pooled_width; + const int64_t c = (index / pooled_area) % channel_num; + const int64_t r = index / channel_pooled_area; + const T* offset_rois_dptr = rois_dptr + r * 5; + const int64_t n = static_cast(offset_rois_dptr[0]); + const T align_offset = aligned ? 
static_cast(0.5) : static_cast(0.f); + const T roi_start_w = offset_rois_dptr[1] * spatial_scale - align_offset; + const T roi_start_h = offset_rois_dptr[2] * spatial_scale - align_offset; + const T roi_end_w = offset_rois_dptr[3] * spatial_scale - align_offset; + const T roi_end_h = offset_rois_dptr[4] * spatial_scale - align_offset; + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + // aligned == false is for compatibility. the argument "aligned" doesn't have the semantic of + // determining minimum roi size + if (aligned == false) { + roi_height = max(roi_height, static_cast(1.0)); + roi_width = max(roi_width, static_cast(1.0)); + } + const T bin_height = static_cast(roi_height) / static_cast(pooled_height); + const T bin_width = static_cast(roi_width) / static_cast(pooled_width); + const int32_t bin_grid_height = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); + const int32_t bin_grid_width = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + const T count = max(bin_grid_height * bin_grid_width, 1); + const T bin_diff_avg = out_diff_dptr[index] / count; + T* in_diff_channel_dptr = in_diff_dptr + (n * channel_num + c) * height * width; + FOR_RANGE(int64_t, grid_i, 0, bin_grid_height) { + // + .5f for center position + T y = roi_start_h + h * bin_height + + static_cast(grid_i + 0.5f) * bin_height / static_cast(bin_grid_height); + FOR_RANGE(int64_t, grid_j, 0, bin_grid_width) { + T x = roi_start_w + w * bin_width + + static_cast(grid_j + 0.5f) * bin_width / static_cast(bin_grid_width); + T diff11 = 0; + T diff21 = 0; + T diff12 = 0; + T diff22 = 0; + int32_t x_low = 0; + int32_t x_high = 0; + int32_t y_low = 0; + int32_t y_high = 0; + bool has_diff = BilinearInterpolateDiff(bin_diff_avg, height, width, y, x, diff11, diff21, + diff12, diff22, x_low, x_high, y_low, y_high); + if (has_diff) { + const int64_t q11 = y_low * width + x_low; + const int64_t q21 = y_low * width + x_high; + const int64_t q12 = y_high * width + x_low; + const int64_t q22 = y_high * width + x_high; + atomicAdd(in_diff_channel_dptr + q11, diff11); + atomicAdd(in_diff_channel_dptr + q21, diff21); + atomicAdd(in_diff_channel_dptr + q12, diff12); + atomicAdd(in_diff_channel_dptr + q22, diff22); + } + } + } + } +} + +} // namespace + +template +class RoIAlignKernel final : public user_op::OpKernel { + public: + RoIAlignKernel() = default; + ~RoIAlignKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_blob = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); + if (rois_blob->shape_view().elem_cnt() == 0) { return; } + user_op::Tensor* y_blob = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t pooled_h = ctx->Attr("pooled_h"); + const int32_t pooled_w = ctx->Attr("pooled_w"); + const float spatial_scale = ctx->Attr("spatial_scale"); + const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); + const bool aligned = ctx->Attr("aligned"); + + const int64_t elem_cnt = y_blob->shape_view().elem_cnt(); + RoiAlignForward<<stream()->As()->cuda_stream()>>>( + elem_cnt, x_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, + x_blob->shape_view().At(1), x_blob->shape_view().At(2), x_blob->shape_view().At(3), + pooled_h, pooled_w, aligned, y_blob->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class 
RoIAlignGradKernel final : public user_op::OpKernel { + public: + RoIAlignGradKernel() = default; + ~RoIAlignGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0); + if (dx_blob == nullptr) { return; } + Memset(ctx->stream(), dx_blob->mut_dptr(), 0, + dx_blob->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* rois_blob = ctx->Tensor4ArgNameAndIndex("rois", 0); + const int32_t pooled_h = ctx->Attr("pooled_h"); + const int32_t pooled_w = ctx->Attr("pooled_w"); + const float spatial_scale = ctx->Attr("spatial_scale"); + const int32_t sampling_ratio = ctx->Attr("sampling_ratio"); + const bool aligned = ctx->Attr("aligned"); + + const int64_t elem_cnt = dy_blob->shape_view().elem_cnt(); + if (elem_cnt > 0) { + RoiAlignBackward<<stream()->As()->cuda_stream()>>>( + elem_cnt, dy_blob->dptr(), rois_blob->dptr(), spatial_scale, sampling_ratio, + dx_blob->shape_view().At(1), dx_blob->shape_view().At(2), dx_blob->shape_view().At(3), + pooled_h, pooled_w, aligned, dx_blob->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +REGISTER_USER_KERNEL("roi_align") + .SetCreateFn>() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + +REGISTER_USER_KERNEL("roi_align_grad") + .SetCreateFn>() + .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/roll_kernel.hip.cpp b/oneflow/user/kernels/roll_kernel.hip.cpp index dc65156..c1e119d 100644 --- a/oneflow/user/kernels/roll_kernel.hip.cpp +++ b/oneflow/user/kernels/roll_kernel.hip.cpp @@ -1,295 +1,295 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/roll_kernel_utils.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void RollCudaKernel(const T* in_ptr, const SHIFTS shifts, const SHAPE shape, - const STRIDE stride, const int64_t elements, T* out_ptr) { - int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (global_index < elements) { - int32_t shifted_global_index = - getShiftedIndex(global_index, shifts.val, shape.val, stride.val); - out_ptr[global_index] = in_ptr[shifted_global_index]; - global_index += step; - } -} - -template -struct GpuRollFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const SHIFTS shifts, const SHAPE shape, - const STRIDE stride, const int64_t elements, T* out_ptr) { - RollCudaKernel<<As()->cuda_stream()>>>( - in_ptr, shifts, shape, stride, elements, out_ptr); - } -}; - -template -struct GpuRollFunctor final { - void operator()(ep::Stream* stream, const float16* in_ptr, const SHIFTS shifts, const SHAPE shape, - const STRIDE stride, const int64_t elements, float16* out_ptr) { - RollCudaKernel<<As()->cuda_stream()>>>( - reinterpret_cast(in_ptr), shifts, shape, stride, elements, - reinterpret_cast(out_ptr)); - } -}; - -template -__global__ void RollFlattenCudaKernel(const T* in_ptr, const int64_t start, - const int64_t elem_count_minus_start, const int64_t elements, - T* out_ptr) { - int64_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - - while (global_index < elements) { - int64_t source_idx = 0; - if (global_index >= elem_count_minus_start) { - source_idx = global_index - elem_count_minus_start; - } else { - source_idx = global_index + start; - } - out_ptr[global_index] = in_ptr[source_idx]; - - global_index += step; - } -} - -template -struct GpuRollFlattenFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const int64_t start, - const int64_t elem_count_minus_start, const int64_t elements, T* out_ptr) { - RollFlattenCudaKernel<<As()->cuda_stream()>>>( - in_ptr, start, elem_count_minus_start, elements, out_ptr); - } -}; - -template<> -void GpuRollFlattenFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const int64_t start, - const int64_t elem_count_minus_start, - const int64_t elements, float16* out_ptr) { - RollFlattenCudaKernel<<As()->cuda_stream()>>>( - reinterpret_cast(in_ptr), start, elem_count_minus_start, elements, - reinterpret_cast(out_ptr)); -} - -template -__global__ void Roll1DimCudaKernel(const T* in_ptr, const int32_t stride_x_size, - const int32_t stride, const int32_t size_minus_start, - const int32_t size_minus_start_x_stride, - const int32_t start_x_stride, const int64_t elements, - T* out_ptr) { - int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - - while (global_index < elements) { - // roll dim idx is the index of linear_index along the rolling dimension. - int32_t roll_dim_idx = global_index % stride_x_size / stride; - // index into the source data to find appropriate value. 
- int32_t source_idx = 0; - if (roll_dim_idx >= size_minus_start) { - source_idx = global_index - size_minus_start_x_stride; - } else { - source_idx = global_index + start_x_stride; - } - out_ptr[global_index] = in_ptr[source_idx]; - - global_index += step; - } -} - -template -struct GpuRoll1DimFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const int32_t stride_x_size, - const int32_t stride, const int32_t size_minus_start, - const int32_t size_minus_start_x_stride, const int32_t start_x_stride, - const int64_t elements, T* out_ptr) { - Roll1DimCudaKernel<<As()->cuda_stream()>>>( - in_ptr, stride_x_size, stride, size_minus_start, size_minus_start_x_stride, start_x_stride, - elements, out_ptr); - } -}; - -template<> -void GpuRoll1DimFunctor::operator()(ep::Stream* stream, const float16* in_ptr, - const int32_t stride_x_size, const int32_t stride, - const int32_t size_minus_start, - const int32_t size_minus_start_x_stride, - const int32_t start_x_stride, const int64_t elements, - float16* out_ptr) { - Roll1DimCudaKernel<<As()->cuda_stream()>>>( - reinterpret_cast(in_ptr), stride_x_size, stride, size_minus_start, - size_minus_start_x_stride, start_x_stride, elements, reinterpret_cast(out_ptr)); -} - -} // namespace - -template -class GpuRollKernel final : public user_op::OpKernel { - public: - GpuRollKernel() = default; - ~GpuRollKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const std::vector& shifts = ctx->Attr>("shifts"); - const std::vector& dims = ctx->Attr>("dims"); - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - const int64_t elem_count = out->shape_view().elem_cnt(); - - if (dims[0] == -1) { - // NOTE(Liang Depeng): Borrow the implementation of pytorch and simplify to 1d array case. - int64_t start = (elem_count - shifts[0]) % elem_count; - if (start < 0) start = start + elem_count; - const int64_t elem_count_minus_start = elem_count - start; - GpuRollFlattenFunctor()(ctx->stream(), in_ptr, start, elem_count_minus_start, elem_count, - out_ptr); - } else { - SHAPE new_shape{}; - SHIFTS new_shifts{}; - int32_t num_axes = 0; - computeParams(in->shape_view(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); - - STRIDE stride{}; - initStride(stride, new_shape, num_axes); - - if (dims.size() == 1) { - // NOTE(Liang Depeng): Borrow the implementation of pytorch - const int32_t size = new_shape.val[dims[0]]; - int32_t start = (size - new_shifts.val[dims[0]]) % size; - // Behavior of % is different in C++ vs Python for negative numbers. This - // corrects the difference. 
- if (start < 0) start = start + size; - - const int32_t stride_x_size = stride.val[dims[0]] * size; - const int32_t size_minus_start = size - start; - const int32_t size_minus_start_x_stride = size_minus_start * stride.val[dims[0]]; - const int32_t start_x_stride = start * stride.val[dims[0]]; - - GpuRoll1DimFunctor()(ctx->stream(), in_ptr, stride_x_size, stride.val[dims[0]], - size_minus_start, size_minus_start_x_stride, start_x_stride, - elem_count, out_ptr); - - } else { - transformShifts(new_shifts.val, new_shape.val, num_axes); - switch (num_axes) { - case 1: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 2: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 3: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 4: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 5: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 6: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 7: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 8: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 9: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, - out_ptr); - break; - case 10: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 11: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 12: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 13: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 14: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 15: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - case 16: - GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, - elem_count, out_ptr); - break; - default: break; - } - } - } - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_ROLL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("roll").SetCreateFn>().SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("in", 0) == GetDataType::value)) - -REGISTER_ROLL_KERNEL(float); -REGISTER_ROLL_KERNEL(double); -REGISTER_ROLL_KERNEL(float16); -REGISTER_ROLL_KERNEL(bool); -REGISTER_ROLL_KERNEL(uint8_t); -REGISTER_ROLL_KERNEL(int8_t); -REGISTER_ROLL_KERNEL(int32_t); -REGISTER_ROLL_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/roll_kernel_utils.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void RollCudaKernel(const T* in_ptr, const SHIFTS shifts, const SHAPE shape, + const STRIDE stride, const int64_t elements, T* out_ptr) { + int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (global_index < elements) { + int32_t shifted_global_index = + getShiftedIndex(global_index, shifts.val, shape.val, stride.val); + out_ptr[global_index] = in_ptr[shifted_global_index]; + global_index += step; + } +} + +template +struct GpuRollFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const SHIFTS shifts, const SHAPE shape, + const STRIDE stride, const int64_t elements, T* out_ptr) { + RollCudaKernel<<As()->cuda_stream()>>>( + in_ptr, shifts, shape, stride, elements, out_ptr); + } +}; + +template +struct GpuRollFunctor final { + void operator()(ep::Stream* stream, const float16* in_ptr, const SHIFTS shifts, const SHAPE shape, + const STRIDE stride, const int64_t elements, float16* out_ptr) { + RollCudaKernel<<As()->cuda_stream()>>>( + reinterpret_cast(in_ptr), shifts, shape, stride, elements, + reinterpret_cast(out_ptr)); + } +}; + +template +__global__ void RollFlattenCudaKernel(const T* in_ptr, const int64_t start, + const int64_t elem_count_minus_start, const int64_t elements, + T* out_ptr) { + int64_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + + while (global_index < elements) { + int64_t source_idx = 0; + if (global_index >= elem_count_minus_start) { + source_idx = global_index - elem_count_minus_start; + } else { + source_idx = global_index + start; + } + out_ptr[global_index] = in_ptr[source_idx]; + + global_index += step; + } +} + +template +struct GpuRollFlattenFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const int64_t start, + const int64_t elem_count_minus_start, const int64_t elements, T* out_ptr) { + RollFlattenCudaKernel<<As()->cuda_stream()>>>( + in_ptr, start, elem_count_minus_start, elements, out_ptr); + } +}; + +template<> +void GpuRollFlattenFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const int64_t start, + const int64_t elem_count_minus_start, + const int64_t elements, float16* out_ptr) { + RollFlattenCudaKernel<<As()->cuda_stream()>>>( + reinterpret_cast(in_ptr), start, elem_count_minus_start, elements, + reinterpret_cast(out_ptr)); +} + +template +__global__ void Roll1DimCudaKernel(const T* in_ptr, const int32_t stride_x_size, + const int32_t stride, const int32_t size_minus_start, + const int32_t size_minus_start_x_stride, + const int32_t start_x_stride, const int64_t elements, + T* out_ptr) { + int32_t global_index = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + + while (global_index < elements) { + // roll dim idx is the index of linear_index along the rolling dimension. + int32_t roll_dim_idx = global_index % stride_x_size / stride; + // index into the source data to find appropriate value. 
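    // [Editor's note: illustrative worked example, not part of the original patch.]
    // The branch below realizes out[j] = in[(j + start) % size] along the rolled
    // dimension without a modulo in the inner loop. Assuming stride == 1 (so
    // start_x_stride == start and size_minus_start_x_stride == size_minus_start),
    // size == 4 and shift == 1, hence start = (4 - 1) % 4 = 3 and size_minus_start = 1:
    //   j = 0: 0 <  size_minus_start -> source_idx = 0 + start            = 3
    //   j = 1: 1 >= size_minus_start -> source_idx = 1 - size_minus_start = 0
    //   j = 2:                          source_idx = 2 - size_minus_start = 1
    //   j = 3:                          source_idx = 3 - size_minus_start = 2
    // i.e. the output {in[3], in[0], in[1], in[2]} is the input rolled forward by one.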
+ int32_t source_idx = 0; + if (roll_dim_idx >= size_minus_start) { + source_idx = global_index - size_minus_start_x_stride; + } else { + source_idx = global_index + start_x_stride; + } + out_ptr[global_index] = in_ptr[source_idx]; + + global_index += step; + } +} + +template +struct GpuRoll1DimFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const int32_t stride_x_size, + const int32_t stride, const int32_t size_minus_start, + const int32_t size_minus_start_x_stride, const int32_t start_x_stride, + const int64_t elements, T* out_ptr) { + Roll1DimCudaKernel<<As()->cuda_stream()>>>( + in_ptr, stride_x_size, stride, size_minus_start, size_minus_start_x_stride, start_x_stride, + elements, out_ptr); + } +}; + +template<> +void GpuRoll1DimFunctor::operator()(ep::Stream* stream, const float16* in_ptr, + const int32_t stride_x_size, const int32_t stride, + const int32_t size_minus_start, + const int32_t size_minus_start_x_stride, + const int32_t start_x_stride, const int64_t elements, + float16* out_ptr) { + Roll1DimCudaKernel<<As()->cuda_stream()>>>( + reinterpret_cast(in_ptr), stride_x_size, stride, size_minus_start, + size_minus_start_x_stride, start_x_stride, elements, reinterpret_cast(out_ptr)); +} + +} // namespace + +template +class GpuRollKernel final : public user_op::OpKernel { + public: + GpuRollKernel() = default; + ~GpuRollKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const std::vector& shifts = ctx->Attr>("shifts"); + const std::vector& dims = ctx->Attr>("dims"); + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + const int64_t elem_count = out->shape_view().elem_cnt(); + + if (dims[0] == -1) { + // NOTE(Liang Depeng): Borrow the implementation of pytorch and simplify to 1d array case. + int64_t start = (elem_count - shifts[0]) % elem_count; + if (start < 0) start = start + elem_count; + const int64_t elem_count_minus_start = elem_count - start; + GpuRollFlattenFunctor()(ctx->stream(), in_ptr, start, elem_count_minus_start, elem_count, + out_ptr); + } else { + SHAPE new_shape{}; + SHIFTS new_shifts{}; + int32_t num_axes = 0; + computeParams(in->shape_view(), shifts, dims, new_shifts.val, new_shape.val, &num_axes); + + STRIDE stride{}; + initStride(stride, new_shape, num_axes); + + if (dims.size() == 1) { + // NOTE(Liang Depeng): Borrow the implementation of pytorch + const int32_t size = new_shape.val[dims[0]]; + int32_t start = (size - new_shifts.val[dims[0]]) % size; + // Behavior of % is different in C++ vs Python for negative numbers. This + // corrects the difference. 
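        // [Editor's note: illustrative example, not part of the original patch.]
        // C++ '%' truncates toward zero while Python floors: with size == 4 and a
        // shift of 6, (4 - 6) % 4 == -2 in C++ (Python yields 2), so the fix-up below
        // gives start = -2 + 4 = 2, matching the PyTorch convention this kernel borrows.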
+ if (start < 0) start = start + size; + + const int32_t stride_x_size = stride.val[dims[0]] * size; + const int32_t size_minus_start = size - start; + const int32_t size_minus_start_x_stride = size_minus_start * stride.val[dims[0]]; + const int32_t start_x_stride = start * stride.val[dims[0]]; + + GpuRoll1DimFunctor()(ctx->stream(), in_ptr, stride_x_size, stride.val[dims[0]], + size_minus_start, size_minus_start_x_stride, start_x_stride, + elem_count, out_ptr); + + } else { + transformShifts(new_shifts.val, new_shape.val, num_axes); + switch (num_axes) { + case 1: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 2: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 3: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 4: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 5: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 6: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 7: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 8: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 9: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, elem_count, + out_ptr); + break; + case 10: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 11: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 12: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 13: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 14: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 15: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + case 16: + GpuRollFunctor()(ctx->stream(), in_ptr, new_shifts, new_shape, stride, + elem_count, out_ptr); + break; + default: break; + } + } + } + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_ROLL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("roll").SetCreateFn>().SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("in", 0) == GetDataType::value)) + +REGISTER_ROLL_KERNEL(float); +REGISTER_ROLL_KERNEL(double); +REGISTER_ROLL_KERNEL(float16); +REGISTER_ROLL_KERNEL(bool); +REGISTER_ROLL_KERNEL(uint8_t); +REGISTER_ROLL_KERNEL(int8_t); +REGISTER_ROLL_KERNEL(int32_t); +REGISTER_ROLL_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/scalar_math_kernels.hip.cpp b/oneflow/user/kernels/scalar_math_kernels.hip.cpp index 6691786..377e669 100644 --- a/oneflow/user/kernels/scalar_math_kernels.hip.cpp +++ b/oneflow/user/kernels/scalar_math_kernels.hip.cpp @@ -1,223 +1,223 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "oneflow/user/kernels/scalar_math_kernels.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -template class Op, typename T> -struct UnaryByScalarFunctor { - __host__ __device__ explicit UnaryByScalarFunctor(T scalar) : scalar(scalar) {} - __device__ T operator()(T a) const { return Op::Invoke(a, scalar); } - const T scalar; -}; - -template class Op, typename T> -struct UnaryByScalarReverseFunctor { - __host__ __device__ explicit UnaryByScalarReverseFunctor(T scalar) : scalar(scalar) {} - __device__ T operator()(T a) const { return Op::Invoke(scalar, a); } - const T scalar; -}; - -template class Op> -struct UnaryByScalarFunctor { - __host__ __device__ explicit UnaryByScalarFunctor(half scalar) : scalar(scalar) {} - __device__ half operator()(half a) const { return Op::Invoke(a, scalar); } - const half scalar; -}; - -template class Op> -struct UnaryByScalarReverseFunctor { - __host__ __device__ explicit UnaryByScalarReverseFunctor(half scalar) : scalar(scalar) {} - __device__ half operator()(half a) const { return Op::Invoke(scalar, a); } - const half scalar; -}; - -template class BIN_OP, typename T> -struct ScalarMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarFunctor(scalar), elem_cnt, out, - in, stream->As()->cuda_stream())); - } -}; - -template class BIN_OP> -struct ScalarMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, - float16* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary( - UnaryByScalarFunctor(float16_2half(scalar)), elem_cnt, - reinterpret_cast(out), reinterpret_cast(in), - stream->As()->cuda_stream())); - } -}; - -template class BIN_OP, typename T> -struct ScalarReverseMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarReverseFunctor(scalar), elem_cnt, - out, in, stream->As()->cuda_stream())); - } -}; - -template class BIN_OP> -struct ScalarReverseMathFunctor final { - void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, - float16* out) { - OF_CUDA_CHECK(cuda::elementwise::Unary( - UnaryByScalarReverseFunctor(float16_2half(scalar)), elem_cnt, - reinterpret_cast(out), reinterpret_cast(in), - stream->As()->cuda_stream())); - } -}; - -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncAdd); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFloorDiv); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFMod); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncMul); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncDiv); -INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); -INSTANTIATE_SCALAR_REVERSE_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); - -template -struct ScalarPowGradFunctor { - OF_DEVICE_FUNC explicit 
ScalarPowGradFunctor(T exponent) : exponent(exponent) {} - __device__ T operator()(T x, T dy) const { - return exponent * (pow(x, exponent - static_cast(1.0))) * dy; - } - const T exponent; -}; - -template<> -struct ScalarPowGradFunctor { - OF_DEVICE_FUNC explicit ScalarPowGradFunctor(half exponent) : exponent(exponent) {} - __device__ half operator()(half x, half dy) const { - return __float2half(__half2float(exponent) - * (powf(__half2float(x), __half2float(exponent) - static_cast(1.0))) - * __half2float(dy)); - } - const half exponent; -}; - -template -struct ScalarReversePowGradFunctor { - OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(T exponent) : exponent(exponent) {} - __device__ T operator()(T x, T dy) const { return pow(exponent, x) * log(exponent) * dy; } - const T exponent; -}; - -template<> -struct ScalarReversePowGradFunctor { - OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(float exponent) : exponent(exponent) {} - __device__ float operator()(float x, float dy) const { - return powf(exponent, x) * logf(exponent) * dy; - } - const float exponent; -}; - -template<> -struct ScalarReversePowGradFunctor { - OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(half exponent) : exponent(exponent) {} - __device__ half operator()(half x, half dy) const { - const float exp = __half2float(exponent); - return __float2half(exp * powf(exp, __half2float(x)) * logf(exp) * __half2float(dy)); - } - const half exponent; -}; - -template -class GpuScalarPowGradKernel final : public user_op::OpKernel { - public: - GpuScalarPowGradKernel() = default; - ~GpuScalarPowGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - const T* x_ptr = x_tensor->dptr(); - const T* dy_ptr = dy_tensor->dptr(); - T* dx_ptr = dx_tensor->mut_dptr(); - T scalar_operand = static_cast(0); - if (ctx->Attr("has_int_operand")) { - scalar_operand = static_cast(ctx->Attr("int_operand")); - } else if (ctx->Attr("has_float_operand")) { - scalar_operand = static_cast(ctx->Attr("float_operand")); - } else { - UNIMPLEMENTED(); - } - const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); - OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( - ScalarPowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, - ctx->stream()->As()->cuda_stream()))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("scalar_pow_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); - -template -class GpuScalarReversePowGradKernel final : public user_op::OpKernel { - public: - GpuScalarReversePowGradKernel() = default; - ~GpuScalarReversePowGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx_tensor = 
ctx->Tensor4ArgNameAndIndex("dx", 0); - const T* x_ptr = x_tensor->dptr(); - const T* dy_ptr = dy_tensor->dptr(); - T* dx_ptr = dx_tensor->mut_dptr(); - T scalar_operand = static_cast(0); - if (ctx->Attr("has_int_operand")) { - scalar_operand = static_cast(ctx->Attr("int_operand")); - } else if (ctx->Attr("has_float_operand")) { - scalar_operand = static_cast(ctx->Attr("float_operand")); - } else { - UNIMPLEMENTED(); - } - const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); - OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( - ScalarReversePowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, - ctx->stream()->As()->cuda_stream()))); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(device, dtype) \ - REGISTER_USER_KERNEL("scalar_reverse_pow_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == device) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); -REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/scalar_math_kernels.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +template class Op, typename T> +struct UnaryByScalarFunctor { + __host__ __device__ explicit UnaryByScalarFunctor(T scalar) : scalar(scalar) {} + __device__ T operator()(T a) const { return Op::Invoke(a, scalar); } + const T scalar; +}; + +template class Op, typename T> +struct UnaryByScalarReverseFunctor { + __host__ __device__ explicit UnaryByScalarReverseFunctor(T scalar) : scalar(scalar) {} + __device__ T operator()(T a) const { return Op::Invoke(scalar, a); } + const T scalar; +}; + +template class Op> +struct UnaryByScalarFunctor { + __host__ __device__ explicit UnaryByScalarFunctor(half scalar) : scalar(scalar) {} + __device__ half operator()(half a) const { return Op::Invoke(a, scalar); } + const half scalar; +}; + +template class Op> +struct UnaryByScalarReverseFunctor { + __host__ __device__ explicit UnaryByScalarReverseFunctor(half scalar) : scalar(scalar) {} + __device__ half operator()(half a) const { return Op::Invoke(scalar, a); } + const half scalar; +}; + +template class BIN_OP, typename T> +struct ScalarMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarFunctor(scalar), elem_cnt, out, + in, stream->As()->cuda_stream())); + } +}; + +template class BIN_OP> +struct ScalarMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, + float16* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary( + UnaryByScalarFunctor(float16_2half(scalar)), 
elem_cnt, + reinterpret_cast(out), reinterpret_cast(in), + stream->As()->cuda_stream())); + } +}; + +template class BIN_OP, typename T> +struct ScalarReverseMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, const T scalar, const T* in, T* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary(UnaryByScalarReverseFunctor(scalar), elem_cnt, + out, in, stream->As()->cuda_stream())); + } +}; + +template class BIN_OP> +struct ScalarReverseMathFunctor final { + void operator()(ep::Stream* stream, const int64_t elem_cnt, float16 scalar, const float16* in, + float16* out) { + OF_CUDA_CHECK(cuda::elementwise::Unary( + UnaryByScalarReverseFunctor(float16_2half(scalar)), elem_cnt, + reinterpret_cast(out), reinterpret_cast(in), + stream->As()->cuda_stream())); + } +}; + +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncAdd); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFloorDiv); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncFMod); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncMul); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncDiv); +INSTANTIATE_SCALAR_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); +INSTANTIATE_SCALAR_REVERSE_MATH_FUNCTORS(DeviceType::kCUDA, BinaryFuncPow); + +template +struct ScalarPowGradFunctor { + OF_DEVICE_FUNC explicit ScalarPowGradFunctor(T exponent) : exponent(exponent) {} + __device__ T operator()(T x, T dy) const { + return exponent * (pow(x, exponent - static_cast(1.0))) * dy; + } + const T exponent; +}; + +template<> +struct ScalarPowGradFunctor { + OF_DEVICE_FUNC explicit ScalarPowGradFunctor(half exponent) : exponent(exponent) {} + __device__ half operator()(half x, half dy) const { + return __float2half(__half2float(exponent) + * (powf(__half2float(x), __half2float(exponent) - static_cast(1.0))) + * __half2float(dy)); + } + const half exponent; +}; + +template +struct ScalarReversePowGradFunctor { + OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(T exponent) : exponent(exponent) {} + __device__ T operator()(T x, T dy) const { return pow(exponent, x) * log(exponent) * dy; } + const T exponent; +}; + +template<> +struct ScalarReversePowGradFunctor { + OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(float exponent) : exponent(exponent) {} + __device__ float operator()(float x, float dy) const { + return powf(exponent, x) * logf(exponent) * dy; + } + const float exponent; +}; + +template<> +struct ScalarReversePowGradFunctor { + OF_DEVICE_FUNC explicit ScalarReversePowGradFunctor(half exponent) : exponent(exponent) {} + __device__ half operator()(half x, half dy) const { + const float exp = __half2float(exponent); + return __float2half(exp * powf(exp, __half2float(x)) * logf(exp) * __half2float(dy)); + } + const half exponent; +}; + +template +class GpuScalarPowGradKernel final : public user_op::OpKernel { + public: + GpuScalarPowGradKernel() = default; + ~GpuScalarPowGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const T* x_ptr = x_tensor->dptr(); + const T* dy_ptr = dy_tensor->dptr(); + T* dx_ptr = dx_tensor->mut_dptr(); + T scalar_operand = static_cast(0); + if (ctx->Attr("has_int_operand")) { + scalar_operand = 
static_cast(ctx->Attr("int_operand")); + } else if (ctx->Attr("has_float_operand")) { + scalar_operand = static_cast(ctx->Attr("float_operand")); + } else { + UNIMPLEMENTED(); + } + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); + OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( + ScalarPowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, + ctx->stream()->As()->cuda_stream()))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("scalar_pow_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_SCALAR_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); + +template +class GpuScalarReversePowGradKernel final : public user_op::OpKernel { + public: + GpuScalarReversePowGradKernel() = default; + ~GpuScalarReversePowGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + const T* x_ptr = x_tensor->dptr(); + const T* dy_ptr = dy_tensor->dptr(); + T* dx_ptr = dx_tensor->mut_dptr(); + T scalar_operand = static_cast(0); + if (ctx->Attr("has_int_operand")) { + scalar_operand = static_cast(ctx->Attr("int_operand")); + } else if (ctx->Attr("has_float_operand")) { + scalar_operand = static_cast(ctx->Attr("float_operand")); + } else { + UNIMPLEMENTED(); + } + const int32_t elem_cnt = x_tensor->shape_view().elem_cnt(); + OF_CUDA_CHECK((oneflow::cuda::elementwise::Binary( + ScalarReversePowGradFunctor(scalar_operand), elem_cnt, dx_ptr, x_ptr, dy_ptr, + ctx->stream()->As()->cuda_stream()))); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(device, dtype) \ + REGISTER_USER_KERNEL("scalar_reverse_pow_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == device) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, float); +REGISTER_CUDA_SCALAR_REVERSE_POW_BACKWARD_KERNEL(DeviceType::kCUDA, double); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/search_sorted_kernel.hip.cpp b/oneflow/user/kernels/search_sorted_kernel.hip.cpp index 2fcdabe..bb65bbe 100644 --- a/oneflow/user/kernels/search_sorted_kernel.hip.cpp +++ b/oneflow/user/kernels/search_sorted_kernel.hip.cpp @@ -1,129 +1,129 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/user/kernels/search_sorted_kernel_util.h" - -namespace oneflow { - -template -__global__ void DoSearchSortedLogical(int32_t instance_num, bool is_sequence_1d, - K values_shape_last, K sequence_shape_last, bool right, - const T* values_ptr, const T* sequence_ptr, K* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, instance_num) { - K start_bd = is_sequence_1d ? 0 : i / values_shape_last * sequence_shape_last; - K end_bd = start_bd + sequence_shape_last; - K pos = !right - ? cus_lower_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd - : cus_upper_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd; - out_ptr[i] = pos; - } -} - -template -__global__ void DoSearchSortedScalarLogical(K sequence_shape_last, bool right, const T values, - const T* sequence_ptr, K* out_ptr) { - CUDA_1D_KERNEL_LOOP(i, 1) { - K pos = !right ? cus_lower_bound(0, sequence_shape_last, values, sequence_ptr) - : cus_upper_bound(0, sequence_shape_last, values, sequence_ptr); - out_ptr[0] = pos; - } -} - -template -class GpuSearchSortedKernel final : public user_op::OpKernel { - public: - GpuSearchSortedKernel() = default; - ~GpuSearchSortedKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); - const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); - - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - const bool& right = ctx->Attr("right"); - const T* values_ptr = values->dptr(); - const T* sequence_ptr = sorted_sequence->dptr(); - K* out_ptr = out->mut_dptr(); - const int32_t instance_num = values->shape_view().elem_cnt(); - bool is_values_scalar = values->shape_view().NumAxes() == 0; - bool is_sequence_1d = (sorted_sequence->shape_view().NumAxes() == 1); - K values_shape_last = - is_values_scalar ? 
1 : values->shape_view().At(values->shape_view().NumAxes() - 1); - K sequence_shape_last = - sorted_sequence->shape_view().At(sorted_sequence->shape_view().NumAxes() - 1); - RUN_CUDA_KERNEL((DoSearchSortedLogical), ctx->stream(), instance_num, instance_num, - is_sequence_1d, values_shape_last, sequence_shape_last, right, values_ptr, - sequence_ptr, out_ptr); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_GPU_SEARCH_SORTED_KERNEL(in_dtype, out_dtype) \ - REGISTER_USER_KERNEL("searchsorted") \ - .SetCreateFn< \ - GpuSearchSortedKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ - && (user_op::HobDataType("values", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - -template -class GpuSearchSortedScalarKernel final : public user_op::OpKernel { - public: - GpuSearchSortedScalarKernel() = default; - ~GpuSearchSortedScalarKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const bool& right = ctx->Attr("right"); - const T& values = static_cast(ctx->Attr("values")); - - const T* sequence_ptr = sorted_sequence->dptr(); - K* out_ptr = out->mut_dptr(); - K sequence_shape_last = sorted_sequence->shape_view().At(0); - RUN_CUDA_KERNEL((DoSearchSortedScalarLogical), ctx->stream(), 1, sequence_shape_last, - right, values, sequence_ptr, out_ptr); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL(in_dtype, out_dtype) \ - REGISTER_USER_KERNEL("searchsorted_scalar") \ - .SetCreateFn< \ - GpuSearchSortedScalarKernel>() \ - .SetIsMatchedHob( \ - (user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/user/kernels/search_sorted_kernel_util.h" + +namespace oneflow { + +template +__global__ void DoSearchSortedLogical(int32_t instance_num, bool is_sequence_1d, + K values_shape_last, K sequence_shape_last, bool right, + const T* values_ptr, const T* sequence_ptr, K* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, instance_num) { + K start_bd = is_sequence_1d ? 0 : i / values_shape_last * sequence_shape_last; + K end_bd = start_bd + sequence_shape_last; + K pos = !right + ? cus_lower_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd + : cus_upper_bound(start_bd, end_bd, values_ptr[i], sequence_ptr) - start_bd; + out_ptr[i] = pos; + } +} + +template +__global__ void DoSearchSortedScalarLogical(K sequence_shape_last, bool right, const T values, + const T* sequence_ptr, K* out_ptr) { + CUDA_1D_KERNEL_LOOP(i, 1) { + K pos = !right ? cus_lower_bound(0, sequence_shape_last, values, sequence_ptr) + : cus_upper_bound(0, sequence_shape_last, values, sequence_ptr); + out_ptr[0] = pos; + } +} + +template +class GpuSearchSortedKernel final : public user_op::OpKernel { + public: + GpuSearchSortedKernel() = default; + ~GpuSearchSortedKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); + const user_op::Tensor* values = ctx->Tensor4ArgNameAndIndex("values", 0); + + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + const bool& right = ctx->Attr("right"); + const T* values_ptr = values->dptr(); + const T* sequence_ptr = sorted_sequence->dptr(); + K* out_ptr = out->mut_dptr(); + const int32_t instance_num = values->shape_view().elem_cnt(); + bool is_values_scalar = values->shape_view().NumAxes() == 0; + bool is_sequence_1d = (sorted_sequence->shape_view().NumAxes() == 1); + K values_shape_last = + is_values_scalar ? 
1 : values->shape_view().At(values->shape_view().NumAxes() - 1); + K sequence_shape_last = + sorted_sequence->shape_view().At(sorted_sequence->shape_view().NumAxes() - 1); + RUN_CUDA_KERNEL((DoSearchSortedLogical), ctx->stream(), instance_num, instance_num, + is_sequence_1d, values_shape_last, sequence_shape_last, right, values_ptr, + sequence_ptr, out_ptr); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_GPU_SEARCH_SORTED_KERNEL(in_dtype, out_dtype) \ + REGISTER_USER_KERNEL("searchsorted") \ + .SetCreateFn< \ + GpuSearchSortedKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ + && (user_op::HobDataType("values", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ + && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + +template +class GpuSearchSortedScalarKernel final : public user_op::OpKernel { + public: + GpuSearchSortedScalarKernel() = default; + ~GpuSearchSortedScalarKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* sorted_sequence = ctx->Tensor4ArgNameAndIndex("sorted_sequence", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const bool& right = ctx->Attr("right"); + const T& values = static_cast(ctx->Attr("values")); + + const T* sequence_ptr = sorted_sequence->dptr(); + K* out_ptr = out->mut_dptr(); + K sequence_shape_last = sorted_sequence->shape_view().At(0); + RUN_CUDA_KERNEL((DoSearchSortedScalarLogical), ctx->stream(), 1, sequence_shape_last, + right, values, sequence_ptr, out_ptr); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL(in_dtype, out_dtype) \ + REGISTER_USER_KERNEL("searchsorted_scalar") \ + .SetCreateFn< \ + GpuSearchSortedScalarKernel>() \ + .SetIsMatchedHob( \ + (user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("sorted_sequence", 0) == OF_PP_PAIR_SECOND(in_dtype)) \ + && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(out_dtype))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_GPU_SEARCH_SORTED_SCALAR_KERNEL, ARITHMETIC_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp index 237d71b..2181b4b 100644 --- a/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/sigmoid_cross_entropy_kernel.hip.cpp @@ -1,55 +1,55 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/sigmoid_cross_entropy_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { -template class Opt, typename PredT, typename LabelT> -struct ElemwiseSigmoidCrossEntropyGradFunctor final { - void operator()(ep::Stream* stream, int64_t n, PredT* prediction_diff, const PredT* prediction, - const LabelT* label, const PredT* loss_diff) { - OF_CUDA_CHECK(cuda::elementwise::Ternary(Opt(), n, prediction_diff, prediction, - label, loss_diff, - stream->As()->cuda_stream())); - } -}; - -template class Opt, typename PredT, typename LabelT> -struct ElemwiseSigmoidCrossEntropyFunctor final { - void operator()(ep::Stream* stream, int64_t n, PredT* loss, const PredT* prediction, - const LabelT* label) { - OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), n, loss, prediction, label, - stream->As()->cuda_stream())); - } -}; -} // namespace -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, float) -REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, double) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int32_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int8_t) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, float) -REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/sigmoid_cross_entropy_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { +template class Opt, typename PredT, typename LabelT> +struct ElemwiseSigmoidCrossEntropyGradFunctor final { + void operator()(ep::Stream* stream, int64_t n, PredT* prediction_diff, const PredT* prediction, + const LabelT* label, const PredT* loss_diff) { + OF_CUDA_CHECK(cuda::elementwise::Ternary(Opt(), n, prediction_diff, prediction, + label, loss_diff, + stream->As()->cuda_stream())); + } +}; + +template class Opt, typename PredT, typename LabelT> +struct ElemwiseSigmoidCrossEntropyFunctor final { + void operator()(ep::Stream* stream, int64_t n, PredT* loss, const PredT* prediction, + const LabelT* label) { + OF_CUDA_CHECK(cuda::elementwise::Binary(Opt(), n, loss, prediction, label, + stream->As()->cuda_stream())); + } +}; +} // namespace +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, float, float) +REGISTER_SIGMOID_CROSS_ENTROPY_KERNEL(DeviceType::kCUDA, double, double) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int32_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, int8_t) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, float, float) +REGISTER_SIGMOID_CROSS_ENTROPY_GRAD_KERNEL(DeviceType::kCUDA, double, double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/slice_util.hip.cpp b/oneflow/user/kernels/slice_util.hip.cpp index a008c27..2f32177 100644 --- a/oneflow/user/kernels/slice_util.hip.cpp +++ b/oneflow/user/kernels/slice_util.hip.cpp @@ -1,232 +1,232 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/slice_util.h" -#include "oneflow/core/common/switch_func.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void SliceForwardGpu(const int n, SliceParams params, - SliceIndexHelper entire_idx_cvtr, - SliceIndexHelper sliced_idx_cvtr, const T* entire, - T* sliced) { - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); - sliced[i] = entire[offset]; - } -} - -template -__global__ void SliceForwardGpu(const int n, SliceParams entire_params, SliceParams sliced_params, - SliceIndexHelper entire_splitted_large_idx_cvtr, - SliceIndexHelper sliced_splitted_large_idx_cvtr, - SliceIndexHelper entire_full_small_idx_cvtr, - SliceIndexHelper sliced_full_small_idx_cvtr, const T* entire, - T* sliced) { - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t entire_offset = SliceOffsetToEntireOffset( - i, entire_params, entire_splitted_large_idx_cvtr, sliced_splitted_large_idx_cvtr); - int64_t sliced_offset = SliceOffsetToEntireOffset( - i, sliced_params, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr); - sliced[sliced_offset] = entire[entire_offset]; - } -} - -template -__global__ void SliceBackwardGpu(const int n, SliceParams params, - SliceIndexHelper entire_idx_cvtr, - SliceIndexHelper sliced_idx_cvtr, T* entire, - const T* sliced) { - CUDA_1D_KERNEL_LOOP(i, n) { - int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); - entire[offset] = sliced[i]; - } -} - -template -void LaunchSliceForward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { - CHECK_EQ(params.ndim, NDIM); - int64_t elem_cnt = params.elem_cnt(); - SliceIndexHelper entire_idx_cvtr(params.dims); - SliceIndexHelper sliced_idx_cvtr(params.size); - if (elem_cnt == 0) { return; } - SliceForwardGpu<<As()->cuda_stream()>>>( - elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, sliced); -} - -template -void LaunchSliceForward(ep::Stream* stream, const SliceParams& entire_params, - const SliceParams& sliced_params, const T* entire, T* sliced) { - CHECK_EQ(entire_params.ndim, NDIM); - CHECK_EQ(sliced_params.ndim, NDIM); - int64_t elem_cnt = entire_params.elem_cnt(); - if (elem_cnt == 0) { return; } - SliceIndexHelper entire_splitted_large_idx_cvtr(entire_params.dims); - SliceIndexHelper sliced_splitted_large_idx_cvtr(entire_params.size); - SliceIndexHelper entire_full_small_idx_cvtr(sliced_params.dims); - SliceIndexHelper sliced_full_small_idx_cvtr(sliced_params.size); - SliceForwardGpu<<As()->cuda_stream()>>>( - elem_cnt, entire_params, sliced_params, entire_splitted_large_idx_cvtr, - sliced_splitted_large_idx_cvtr, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr, - entire, sliced); -} - -template -void LaunchSliceBackward(ep::Stream* stream, const SliceParams& params, const T* sliced, - T* entire) { - CHECK_EQ(params.ndim, NDIM); - int64_t elem_cnt = params.elem_cnt(); - SliceIndexHelper entire_idx_cvtr(params.dims); - SliceIndexHelper sliced_idx_cvtr(params.size); - if (elem_cnt == 0) { return; } - SliceBackwardGpu<<As()->cuda_stream()>>>( - elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, sliced); -} - -template -struct SliceSwitchUtil final { -#define MAKE_SLICE_SWITCH_ENTRY(func_name, N) func_name -#define DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(func_name) \ - DEFINE_STATIC_SWITCH_FUNC(void, func_name, MAKE_SLICE_SWITCH_ENTRY, MAKE_NDIM_CTRV_SEQ(DIM_SEQ)) - - 
DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceForward) - DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceBackward) -#undef DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD -#undef MAKE_SLICE_SWITCH_ENTRY -}; - -template -size_t GetPackSize(const SliceParams& params, const T* entire, const T* sliced) { - CHECK_GT(params.ndim, 0); - const int64_t last_dim = params.ndim - 1; - const int64_t mask = (params.dims[last_dim] * sizeof(T)) | (params.start[last_dim] * sizeof(T)) - | (params.size[last_dim] * sizeof(T)) - | static_cast(reinterpret_cast(entire)) - | static_cast(reinterpret_cast(sliced)); - if ((mask & 0xF) == 0) { - return 16; - } else if ((mask & 0x7) == 0) { - return 8; - } else if ((mask & 0x3) == 0) { - return 4; - } else if ((mask & 0x1) == 0) { - return 2; - } else { - return 1; - } -} - -template -void GetPackedParams(const SliceParams& params, const T* entire, const T* sliced, size_t* pack_size, - SliceParams* packed_params) { - CHECK_GT(params.ndim, 0); - const int64_t last_dim = params.ndim - 1; - if (params.step[last_dim] == 1) { - *pack_size = GetPackSize(params, entire, sliced); - CHECK_GE(*pack_size, sizeof(T)); - const int64_t elem_per_pack = *pack_size / sizeof(T); - *packed_params = params; - packed_params->dims[last_dim] /= elem_per_pack; - packed_params->start[last_dim] /= elem_per_pack; - packed_params->size[last_dim] /= elem_per_pack; - } else { - *pack_size = sizeof(T); - *packed_params = params; - } -} - -} // namespace - -template -struct SliceKernelUtil { - static void Forward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { - SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); - size_t pack_size; - SliceParams packed_params{}; - GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); - if (pack_size == 1) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 2) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 4) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 8) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else if (pack_size == 16) { - SliceSwitchUtil::SwitchLaunchSliceForward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(entire), reinterpret_cast(sliced)); - } else { - UNIMPLEMENTED(); - } - } - - static void Forward(ep::Stream* stream, const SliceParams& entire_params, - const SliceParams& sliced_params, const T* entire, T* sliced) { - SliceSwitchUtil::SwitchLaunchSliceForward(SwitchCase(entire_params.ndim), stream, - entire_params, sliced_params, entire, sliced); - } - - static void Backward(ep::Stream* stream, const SliceParams& params, const T* sliced, T* entire) { - SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); - size_t pack_size; - SliceParams packed_params{}; - GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); - if (pack_size == 1) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), 
reinterpret_cast(entire)); - } else if (pack_size == 2) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else if (pack_size == 4) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else if (pack_size == 8) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else if (pack_size == 16) { - SliceSwitchUtil::SwitchLaunchSliceBackward( - SwitchCase(packed_params.ndim), stream, packed_params, - reinterpret_cast(sliced), reinterpret_cast(entire)); - } else { - UNIMPLEMENTED(); - } - } -}; - -INSTANTIATE_SLICE_KERNEL_UTIL_WITH_DEVICE(DeviceType::kCUDA) -INSTANTIATE_SLICE_KERNEL_UTIL(DeviceType::kCUDA, float16) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/slice_util.h" +#include "oneflow/core/common/switch_func.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void SliceForwardGpu(const int n, SliceParams params, + SliceIndexHelper entire_idx_cvtr, + SliceIndexHelper sliced_idx_cvtr, const T* entire, + T* sliced) { + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); + sliced[i] = entire[offset]; + } +} + +template +__global__ void SliceForwardGpu(const int n, SliceParams entire_params, SliceParams sliced_params, + SliceIndexHelper entire_splitted_large_idx_cvtr, + SliceIndexHelper sliced_splitted_large_idx_cvtr, + SliceIndexHelper entire_full_small_idx_cvtr, + SliceIndexHelper sliced_full_small_idx_cvtr, const T* entire, + T* sliced) { + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t entire_offset = SliceOffsetToEntireOffset( + i, entire_params, entire_splitted_large_idx_cvtr, sliced_splitted_large_idx_cvtr); + int64_t sliced_offset = SliceOffsetToEntireOffset( + i, sliced_params, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr); + sliced[sliced_offset] = entire[entire_offset]; + } +} + +template +__global__ void SliceBackwardGpu(const int n, SliceParams params, + SliceIndexHelper entire_idx_cvtr, + SliceIndexHelper sliced_idx_cvtr, T* entire, + const T* sliced) { + CUDA_1D_KERNEL_LOOP(i, n) { + int64_t offset = SliceOffsetToEntireOffset(i, params, entire_idx_cvtr, sliced_idx_cvtr); + entire[offset] = sliced[i]; + } +} + +template +void LaunchSliceForward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { + CHECK_EQ(params.ndim, NDIM); + int64_t elem_cnt = params.elem_cnt(); + SliceIndexHelper entire_idx_cvtr(params.dims); + SliceIndexHelper sliced_idx_cvtr(params.size); + if (elem_cnt == 0) { return; } + SliceForwardGpu<<As()->cuda_stream()>>>( + elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, 
sliced); +} + +template +void LaunchSliceForward(ep::Stream* stream, const SliceParams& entire_params, + const SliceParams& sliced_params, const T* entire, T* sliced) { + CHECK_EQ(entire_params.ndim, NDIM); + CHECK_EQ(sliced_params.ndim, NDIM); + int64_t elem_cnt = entire_params.elem_cnt(); + if (elem_cnt == 0) { return; } + SliceIndexHelper entire_splitted_large_idx_cvtr(entire_params.dims); + SliceIndexHelper sliced_splitted_large_idx_cvtr(entire_params.size); + SliceIndexHelper entire_full_small_idx_cvtr(sliced_params.dims); + SliceIndexHelper sliced_full_small_idx_cvtr(sliced_params.size); + SliceForwardGpu<<As()->cuda_stream()>>>( + elem_cnt, entire_params, sliced_params, entire_splitted_large_idx_cvtr, + sliced_splitted_large_idx_cvtr, entire_full_small_idx_cvtr, sliced_full_small_idx_cvtr, + entire, sliced); +} + +template +void LaunchSliceBackward(ep::Stream* stream, const SliceParams& params, const T* sliced, + T* entire) { + CHECK_EQ(params.ndim, NDIM); + int64_t elem_cnt = params.elem_cnt(); + SliceIndexHelper entire_idx_cvtr(params.dims); + SliceIndexHelper sliced_idx_cvtr(params.size); + if (elem_cnt == 0) { return; } + SliceBackwardGpu<<As()->cuda_stream()>>>( + elem_cnt, params, entire_idx_cvtr, sliced_idx_cvtr, entire, sliced); +} + +template +struct SliceSwitchUtil final { +#define MAKE_SLICE_SWITCH_ENTRY(func_name, N) func_name +#define DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(func_name) \ + DEFINE_STATIC_SWITCH_FUNC(void, func_name, MAKE_SLICE_SWITCH_ENTRY, MAKE_NDIM_CTRV_SEQ(DIM_SEQ)) + + DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceForward) + DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD(LaunchSliceBackward) +#undef DEFINE_SLICE_SWITCH_UTIL_STATIC_METHOD +#undef MAKE_SLICE_SWITCH_ENTRY +}; + +template +size_t GetPackSize(const SliceParams& params, const T* entire, const T* sliced) { + CHECK_GT(params.ndim, 0); + const int64_t last_dim = params.ndim - 1; + const int64_t mask = (params.dims[last_dim] * sizeof(T)) | (params.start[last_dim] * sizeof(T)) + | (params.size[last_dim] * sizeof(T)) + | static_cast(reinterpret_cast(entire)) + | static_cast(reinterpret_cast(sliced)); + if ((mask & 0xF) == 0) { + return 16; + } else if ((mask & 0x7) == 0) { + return 8; + } else if ((mask & 0x3) == 0) { + return 4; + } else if ((mask & 0x1) == 0) { + return 2; + } else { + return 1; + } +} + +template +void GetPackedParams(const SliceParams& params, const T* entire, const T* sliced, size_t* pack_size, + SliceParams* packed_params) { + CHECK_GT(params.ndim, 0); + const int64_t last_dim = params.ndim - 1; + if (params.step[last_dim] == 1) { + *pack_size = GetPackSize(params, entire, sliced); + CHECK_GE(*pack_size, sizeof(T)); + const int64_t elem_per_pack = *pack_size / sizeof(T); + *packed_params = params; + packed_params->dims[last_dim] /= elem_per_pack; + packed_params->start[last_dim] /= elem_per_pack; + packed_params->size[last_dim] /= elem_per_pack; + } else { + *pack_size = sizeof(T); + *packed_params = params; + } +} + +} // namespace + +template +struct SliceKernelUtil { + static void Forward(ep::Stream* stream, const SliceParams& params, const T* entire, T* sliced) { + SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); + size_t pack_size; + SliceParams packed_params{}; + GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); + if (pack_size == 1) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 2) 
{ + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 4) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 8) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else if (pack_size == 16) { + SliceSwitchUtil::SwitchLaunchSliceForward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(entire), reinterpret_cast(sliced)); + } else { + UNIMPLEMENTED(); + } + } + + static void Forward(ep::Stream* stream, const SliceParams& entire_params, + const SliceParams& sliced_params, const T* entire, T* sliced) { + SliceSwitchUtil::SwitchLaunchSliceForward(SwitchCase(entire_params.ndim), stream, + entire_params, sliced_params, entire, sliced); + } + + static void Backward(ep::Stream* stream, const SliceParams& params, const T* sliced, T* entire) { + SliceParams fold_slice_params = FoldContiguousFullSliceDimensions(params); + size_t pack_size; + SliceParams packed_params{}; + GetPackedParams(fold_slice_params, entire, sliced, &pack_size, &packed_params); + if (pack_size == 1) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 2) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 4) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 8) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else if (pack_size == 16) { + SliceSwitchUtil::SwitchLaunchSliceBackward( + SwitchCase(packed_params.ndim), stream, packed_params, + reinterpret_cast(sliced), reinterpret_cast(entire)); + } else { + UNIMPLEMENTED(); + } + } +}; + +INSTANTIATE_SLICE_KERNEL_UTIL_WITH_DEVICE(DeviceType::kCUDA) +INSTANTIATE_SLICE_KERNEL_UTIL(DeviceType::kCUDA, float16) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp b/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp index 59d4eeb..ff8aca8 100644 --- a/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp +++ b/oneflow/user/kernels/smooth_l1_loss_kernel.hip.cpp @@ -1,145 +1,145 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
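The Forward/Backward wrappers above choose a vectorized "pack" width purely from alignment: the byte extents of the last dimension and the two raw pointer addresses are OR-ed into one mask, and the widest power of two (up to 16 bytes) that divides the mask becomes the pack size. A host-side restatement of that test, with an illustrative name (PickPackSizeBytes is not part of the patch):

#include <cstddef>
#include <cstdint>

template<typename T>
size_t PickPackSizeBytes(int64_t last_dim, int64_t last_start, int64_t last_size,
                         const void* entire, const void* sliced) {
  // Any quantity that is not a multiple of `pack` bytes poisons the corresponding low bits.
  const uint64_t mask = static_cast<uint64_t>(last_dim * sizeof(T))
                        | static_cast<uint64_t>(last_start * sizeof(T))
                        | static_cast<uint64_t>(last_size * sizeof(T))
                        | static_cast<uint64_t>(reinterpret_cast<uintptr_t>(entire))
                        | static_cast<uint64_t>(reinterpret_cast<uintptr_t>(sliced));
  for (size_t pack = 16; pack > 1; pack /= 2) {
    if ((mask & (pack - 1)) == 0) { return pack; }
  }
  return 1;
}

GetPackedParams then divides the last dimension's dims/start/size by pack_size / sizeof(T), so each GPU thread copies one aligned 2-, 4-, 8- or 16-byte pack instead of a single element; packing is only attempted when the last-dimension step is 1, since a strided innermost dimension cannot be coalesced this way.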
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/loss_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -using namespace loss; - -template -struct SmoothL1Functor { - float beta_; - float inv_beta_; - T half_of_one_; - SmoothL1Functor(float beta) - : beta_(beta), inv_beta_(static_cast(1.0 / beta)), half_of_one_(static_cast(0.5)) {} - - __device__ __forceinline__ T operator()(T input_val, T target_val) const { - const T abs_diff = abs(input_val - target_val); - if (abs_diff < beta_) { - return half_of_one_ * abs_diff * abs_diff * inv_beta_; - } else { - return abs_diff - half_of_one_ * beta_; - } - } -}; - -template<> -struct SmoothL1Functor { - half beta_; - half inv_beta_; - half zero_; - half half_of_one_; - SmoothL1Functor(float beta) - : beta_(__float2half(beta)), - inv_beta_(__float2half(static_cast(1.0 / beta))), - zero_(__float2half(0.f)), - half_of_one_(__float2half(0.5f)) {} - - __device__ __forceinline__ half operator()(half input_val, half target_val) const { - const half diff = input_val - target_val; - const half abs_diff = diff < zero_ ? __hneg(diff) : diff; - if (abs_diff < beta_) { - return half_of_one_ * abs_diff * abs_diff * inv_beta_; - } else { - return abs_diff - half_of_one_ * beta_; - } - } -}; - -template -struct SmoothL1GradFunctor { - float beta_; - float inv_beta_; - T zero_; - SmoothL1GradFunctor(float beta) - : beta_(beta), inv_beta_(static_cast(1.0 / beta)), zero_(GetZeroVal()) {} - - __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { - const T diff = input_val - target_val; - const T abs_diff = abs(diff); - T dx_val; - if (abs_diff < beta_) { - dx_val = diff * inv_beta_; - } else { - dx_val = (diff > zero_) - (diff < zero_); - } - return dx_val * dy_val; - } -}; - -template<> -struct SmoothL1GradFunctor { - half beta_; - half inv_beta_; - half zero_; - half one_; - SmoothL1GradFunctor(float beta) - : beta_(__float2half(beta)), - inv_beta_(__float2half(static_cast(1.0 / beta))), - zero_(__float2half(0.f)), - one_(__float2half(1.f)) {} - - __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { - const half diff = input_val - target_val; - const half abs_diff = diff < zero_ ? 
__hneg(diff) : diff; - half dx_val; - if (abs_diff < beta_) { - dx_val = diff * inv_beta_; - } else { - dx_val = (diff > zero_) - (diff < zero_); - } - return dx_val * dy_val; - } -}; - -template -class SmoothL1LossKernel : public SimpleLossKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, T* out) const { - const float beta = ctx->Attr("beta"); - OF_CUDA_CHECK((cuda::elementwise::Binary(SmoothL1Functor(beta), elem_cnt, out, input, target, - ctx->stream()->As()->cuda_stream()))); - } -}; - -template -class SmoothL1LossGradKernel - : public SimpleLossGradKernel> { - public: - void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, - const T* target, const T* dy, T* dx) const { - const float beta = ctx->Attr("beta"); - OF_CUDA_CHECK( - (cuda::elementwise::Ternary(SmoothL1GradFunctor(beta), elem_cnt, dx, input, target, dy, - ctx->stream()->As()->cuda_stream()))); - } -}; - -} // namespace - -REGISTER_SIMPLE_LOSS_KERNEL_CUDA("smooth_l1_loss", SmoothL1LossKernel) -REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("smooth_l1_loss_grad", SmoothL1LossGradKernel) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/loss_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +using namespace loss; + +template +struct SmoothL1Functor { + float beta_; + float inv_beta_; + T half_of_one_; + SmoothL1Functor(float beta) + : beta_(beta), inv_beta_(static_cast(1.0 / beta)), half_of_one_(static_cast(0.5)) {} + + __device__ __forceinline__ T operator()(T input_val, T target_val) const { + const T abs_diff = abs(input_val - target_val); + if (abs_diff < beta_) { + return half_of_one_ * abs_diff * abs_diff * inv_beta_; + } else { + return abs_diff - half_of_one_ * beta_; + } + } +}; + +template<> +struct SmoothL1Functor { + half beta_; + half inv_beta_; + half zero_; + half half_of_one_; + SmoothL1Functor(float beta) + : beta_(__float2half(beta)), + inv_beta_(__float2half(static_cast(1.0 / beta))), + zero_(__float2half(0.f)), + half_of_one_(__float2half(0.5f)) {} + + __device__ __forceinline__ half operator()(half input_val, half target_val) const { + const half diff = input_val - target_val; + const half abs_diff = diff < zero_ ? 
__hneg(diff) : diff; + if (abs_diff < beta_) { + return half_of_one_ * abs_diff * abs_diff * inv_beta_; + } else { + return abs_diff - half_of_one_ * beta_; + } + } +}; + +template +struct SmoothL1GradFunctor { + float beta_; + float inv_beta_; + T zero_; + SmoothL1GradFunctor(float beta) + : beta_(beta), inv_beta_(static_cast(1.0 / beta)), zero_(GetZeroVal()) {} + + __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const { + const T diff = input_val - target_val; + const T abs_diff = abs(diff); + T dx_val; + if (abs_diff < beta_) { + dx_val = diff * inv_beta_; + } else { + dx_val = (diff > zero_) - (diff < zero_); + } + return dx_val * dy_val; + } +}; + +template<> +struct SmoothL1GradFunctor { + half beta_; + half inv_beta_; + half zero_; + half one_; + SmoothL1GradFunctor(float beta) + : beta_(__float2half(beta)), + inv_beta_(__float2half(static_cast(1.0 / beta))), + zero_(__float2half(0.f)), + one_(__float2half(1.f)) {} + + __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const { + const half diff = input_val - target_val; + const half abs_diff = diff < zero_ ? __hneg(diff) : diff; + half dx_val; + if (abs_diff < beta_) { + dx_val = diff * inv_beta_; + } else { + dx_val = (diff > zero_) - (diff < zero_); + } + return dx_val * dy_val; + } +}; + +template +class SmoothL1LossKernel : public SimpleLossKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, T* out) const { + const float beta = ctx->Attr("beta"); + OF_CUDA_CHECK((cuda::elementwise::Binary(SmoothL1Functor(beta), elem_cnt, out, input, target, + ctx->stream()->As()->cuda_stream()))); + } +}; + +template +class SmoothL1LossGradKernel + : public SimpleLossGradKernel> { + public: + void ComputeOut(user_op::KernelComputeContext* ctx, int64_t elem_cnt, const T* input, + const T* target, const T* dy, T* dx) const { + const float beta = ctx->Attr("beta"); + OF_CUDA_CHECK( + (cuda::elementwise::Ternary(SmoothL1GradFunctor(beta), elem_cnt, dx, input, target, dy, + ctx->stream()->As()->cuda_stream()))); + } +}; + +} // namespace + +REGISTER_SIMPLE_LOSS_KERNEL_CUDA("smooth_l1_loss", SmoothL1LossKernel) +REGISTER_SIMPLE_LOSS_GRAD_KERNEL_CUDA("smooth_l1_loss_grad", SmoothL1LossGradKernel) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp index 40e5cb1..c15d355 100644 --- a/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/softmax_cross_entropy_kernel.hip.cpp @@ -1,156 +1,156 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
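For reference, the functors above implement the usual piecewise Smooth L1 definition: a scaled quadratic inside the |diff| < beta window and a shifted absolute value outside it, with the gradient falling back to sign(diff) in the linear region. A scalar host version that mirrors the same branches (beta > 0 assumed):

#include <cmath>

inline float SmoothL1(float input, float target, float beta) {
  const float abs_diff = std::fabs(input - target);
  return abs_diff < beta ? 0.5f * abs_diff * abs_diff / beta : abs_diff - 0.5f * beta;
}

inline float SmoothL1Grad(float input, float target, float dy, float beta) {
  const float diff = input - target;
  const float abs_diff = std::fabs(diff);
  const float ddiff = abs_diff < beta
                          ? diff / beta                                       // quadratic region
                          : static_cast<float>((diff > 0.f) - (diff < 0.f));  // sign(diff)
  return ddiff * dy;
}

The half specialization keeps the same structure but precomputes its constants with __float2half, presumably to avoid repeated float/half conversions inside the device loop.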
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/softmax_cross_entropy_kernel.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { -namespace user_op { - -namespace { - -constexpr int64_t kCrossEntropyGpuBlockSize = 128; - -template -__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, - const T* x, const T* labels, T* y) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - const int tid = threadIdx.x; - for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { - const int row_offset = row * num_classes; - const T* in_row = x + row_offset; - const T* label_row = labels + row_offset; - T result = 0; - for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { - T label = label_row[col]; - T prob = in_row[col]; - result += -label * SafeLog(prob); - } - __syncthreads(); - T row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); - if (0 == tid) { y[row] = row_reduce_result; } - } -} - -__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, - const half* x, const half* labels, half* y) { - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - const int tid = threadIdx.x; - for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { - const int row_offset = row * num_classes; - const half* in_row = x + row_offset; - const half* label_row = labels + row_offset; - float result = 0; - for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { - float label = __half2float(label_row[col]); - float prob = __half2float(in_row[col]); - result += -label * SafeLog(prob); - } - __syncthreads(); - float row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); - if (0 == tid) { y[row] = __float2half(row_reduce_result); } - } -} - -template -__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, - const T* prob, const T* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row_id = i / num_classes; - dx[i] = dy[row_id] * (prob[i] - labels[i]); - } -} - -__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, - const half* prob, const half* labels, const half* dy, - half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const int32_t row_id = i / num_classes; - dx[i] = __hmul(dy[row_id], __hsub(prob[i], labels[i])); - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -} // namespace - -int GetCrossEntropyNumBlocks(const int num_instances) { - return std::min(static_cast(num_instances), kCudaMaxBlocksNum); -} - -int GetCrossEntropyBlockSize() { return kCrossEntropyGpuBlockSize; } - -template -struct CrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const T* x, const T* labels, T* y) { - OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(T) * num_instances, - stream->As()->cuda_stream())); - ComputeEntropyGpu<<As()->cuda_stream()>>>(num_instances, num_classes, - x, labels, y); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const T* 
prob, const T* labels, - const T* dy, T* dx) { - ComputeDiffWithSoftmaxGpu<<As()->cuda_stream()>>>( - elem_cnt, num_classes, prob, labels, dy, dx); - } -}; - -template<> -struct CrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const float16* x, const float16* labels, - float16* y) { - OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(float16) * num_instances, - stream->As()->cuda_stream())); - ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, reinterpret_cast(x), - reinterpret_cast(labels), reinterpret_cast(y)); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const float16* prob, - const float16* labels, const float16* dy, float16* dx) { - ComputeDiffWithSoftmaxGpuHalf<<As()->cuda_stream()>>>( - elem_cnt, num_classes, reinterpret_cast(prob), - reinterpret_cast(labels), reinterpret_cast(dy), - reinterpret_cast(dx)); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_KERNEL, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_GRAD_KERNEL, - OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/softmax_cross_entropy_kernel.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { +namespace user_op { + +namespace { + +constexpr int64_t kCrossEntropyGpuBlockSize = 128; + +template +__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, + const T* x, const T* labels, T* y) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + const int tid = threadIdx.x; + for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { + const int row_offset = row * num_classes; + const T* in_row = x + row_offset; + const T* label_row = labels + row_offset; + T result = 0; + for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { + T label = label_row[col]; + T prob = in_row[col]; + result += -label * SafeLog(prob); + } + __syncthreads(); + T row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); + if (0 == tid) { y[row] = row_reduce_result; } + } +} + +__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, + const half* x, const half* labels, half* y) { + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + const int tid = threadIdx.x; + for (int row = blockIdx.x; row < num_instances; row += gridDim.x) { + const int row_offset = row * num_classes; + const half* in_row = x + row_offset; + const half* label_row = labels + row_offset; + float result = 0; + for (int col = tid; col < num_classes; col += kCrossEntropyGpuBlockSize) { + float label = __half2float(label_row[col]); + float prob = __half2float(in_row[col]); + result += -label * SafeLog(prob); + } + __syncthreads(); + float row_reduce_result = BlockReduce(temp_storage).Reduce(result, hipcub::Sum()); + if (0 == tid) { y[row] = __float2half(row_reduce_result); } + } +} + +template +__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, + const T* prob, const T* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row_id = i / num_classes; + dx[i] = dy[row_id] * (prob[i] - labels[i]); + } +} + +__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, + const half* prob, const half* labels, const half* dy, + half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const int32_t row_id = i / num_classes; + dx[i] = __hmul(dy[row_id], __hsub(prob[i], labels[i])); + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +} // namespace + +int GetCrossEntropyNumBlocks(const int num_instances) { + return std::min(static_cast(num_instances), kCudaMaxBlocksNum); +} + +int GetCrossEntropyBlockSize() { return kCrossEntropyGpuBlockSize; } + +template +struct CrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const T* x, const T* labels, T* y) { + OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(T) * num_instances, + stream->As()->cuda_stream())); + ComputeEntropyGpu<<As()->cuda_stream()>>>(num_instances, num_classes, + x, labels, y); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const T* 
prob, const T* labels, + const T* dy, T* dx) { + ComputeDiffWithSoftmaxGpu<<As()->cuda_stream()>>>( + elem_cnt, num_classes, prob, labels, dy, dx); + } +}; + +template<> +struct CrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const float16* x, const float16* labels, + float16* y) { + OF_CUDA_CHECK(hipMemsetAsync(y, 0, sizeof(float16) * num_instances, + stream->As()->cuda_stream())); + ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, reinterpret_cast(x), + reinterpret_cast(labels), reinterpret_cast(y)); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const float16* prob, + const float16* labels, const float16* dy, float16* dx) { + ComputeDiffWithSoftmaxGpuHalf<<As()->cuda_stream()>>>( + elem_cnt, num_classes, reinterpret_cast(prob), + reinterpret_cast(labels), reinterpret_cast(dy), + reinterpret_cast(dx)); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_KERNEL, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SOFTMAX_CROSS_ENTROPY_GRAD_KERNEL, + OF_PP_MAKE_TUPLE_SEQ(DeviceType::kCUDA), + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sort_kernel.hip.cpp b/oneflow/user/kernels/sort_kernel.hip.cpp index 1f6067f..186dfb9 100644 --- a/oneflow/user/kernels/sort_kernel.hip.cpp +++ b/oneflow/user/kernels/sort_kernel.hip.cpp @@ -1,81 +1,81 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
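Written as a plain host loop, the kernels above compute, per row, the entropy y[row] = -sum over classes of labels * log(prob) (the hipcub block reduction replaces the inner sum on the GPU), and the usual softmax-cross-entropy shortcut for the backward pass, dx = dy * (prob - labels). A reference sketch; the eps clamp stands in for SafeLog, and its exact threshold is an assumption:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline void CrossEntropyRowwiseRef(int64_t num_instances, int64_t num_classes, const float* prob,
                                   const float* labels, const float* dy, float* y, float* dx) {
  constexpr float kEps = 1e-20f;  // stand-in for the SafeLog threshold
  for (int64_t row = 0; row < num_instances; ++row) {
    float loss = 0.f;
    for (int64_t c = 0; c < num_classes; ++c) {
      const int64_t i = row * num_classes + c;
      loss += -labels[i] * std::log(std::max(prob[i], kEps));
      dx[i] = dy[row] * (prob[i] - labels[i]);
    }
    y[row] = loss;
  }
}

Note that the half path accumulates each row sum in float before converting back, which is why ComputeEntropyGpuHalf reduces float rather than half.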
-*/ -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/user/kernels/radix_sort.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -template -class GpuSortKernel final : public user_op::OpKernel { - public: - GpuSortKernel() = default; - ~GpuSortKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - - Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), - in->shape_view().elem_cnt() * sizeof(T)); - const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); - const int32_t instance_num = in->shape_view().elem_cnt() / instance_size; - const std::string& direction = ctx->Attr("direction"); - if (direction == "ASCENDING") { - SortKeysAscending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else if (direction == "DESCENDING") { - SortKeysDescending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), - tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), - ctx->stream()->As()->cuda_stream()); - } else { - UNIMPLEMENTED(); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_SORT_KERNEL(dtype) \ - REGISTER_USER_KERNEL("sort") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ - const Shape& in_shape = ctx->InputShape("in", 0); \ - const int32_t instance_size = in_shape.dim_vec().back(); \ - const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ - const std::string& direction = ctx->Attr("direction"); \ - if (direction == "ASCENDING") { \ - return InferTempStorageForSortKeysAscending(instance_num, instance_size); \ - } else if (direction == "DESCENDING") { \ - return InferTempStorageForSortKeysDescending(instance_num, instance_size); \ - } else { \ - UNIMPLEMENTED(); \ - return 0; \ - } \ - }); - -REGISTER_CUDA_SORT_KERNEL(float) -REGISTER_CUDA_SORT_KERNEL(double) -REGISTER_CUDA_SORT_KERNEL(int32_t) -REGISTER_CUDA_SORT_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/user/kernels/radix_sort.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +template +class GpuSortKernel final : public user_op::OpKernel { + public: + GpuSortKernel() = default; + ~GpuSortKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + + Memcpy(ctx->stream(), out->mut_dptr(), in->dptr(), + in->shape_view().elem_cnt() * sizeof(T)); + const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1); + const int32_t instance_num = in->shape_view().elem_cnt() / instance_size; + const std::string& direction = ctx->Attr("direction"); + if (direction == "ASCENDING") { + SortKeysAscending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else if (direction == "DESCENDING") { + SortKeysDescending(in->dptr(), instance_num, instance_size, tmp_buffer->mut_dptr(), + tmp_buffer->shape_view().elem_cnt(), out->mut_dptr(), + ctx->stream()->As()->cuda_stream()); + } else { + UNIMPLEMENTED(); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_SORT_KERNEL(dtype) \ + REGISTER_USER_KERNEL("sort") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t { \ + const Shape& in_shape = ctx->InputShape("in", 0); \ + const int32_t instance_size = in_shape.dim_vec().back(); \ + const int32_t instance_num = in_shape.elem_cnt() / instance_size; \ + const std::string& direction = ctx->Attr("direction"); \ + if (direction == "ASCENDING") { \ + return InferTempStorageForSortKeysAscending(instance_num, instance_size); \ + } else if (direction == "DESCENDING") { \ + return InferTempStorageForSortKeysDescending(instance_num, instance_size); \ + } else { \ + UNIMPLEMENTED(); \ + return 0; \ + } \ + }); + +REGISTER_CUDA_SORT_KERNEL(float) +REGISTER_CUDA_SORT_KERNEL(double) +REGISTER_CUDA_SORT_KERNEL(int32_t) +REGISTER_CUDA_SORT_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp b/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp index 38c3903..d417fb8 100644 --- a/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp +++ b/oneflow/user/kernels/sparse_cross_entropy_kernel_util.hip.cpp @@ -1,267 +1,267 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
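The sort kernel above first copies the input into out, then sorts each row of instance_size keys independently, using the radix-sort helpers and the tmp_buffer whose size InferTmpSizeFn reserves ahead of time (the device path is presumably a segmented radix sort, which is why the scratch size must be derived from instance_num and instance_size before launch). A host reference of the result it produces for direction == "ASCENDING":

#include <algorithm>
#include <cstdint>
#include <functional>

template<typename T>
void SortEachInstanceAscendingRef(const T* in, int32_t instance_num, int32_t instance_size,
                                  T* out) {
  const int64_t total = static_cast<int64_t>(instance_num) * instance_size;
  std::copy(in, in + total, out);
  for (int32_t i = 0; i < instance_num; ++i) {
    T* row = out + static_cast<int64_t>(i) * instance_size;
    std::sort(row, row + instance_size);  // "DESCENDING" would use std::greater<T>() here
  }
}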
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" -#include "oneflow/core/kernel/kernel_util.hip.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -template -__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const T* x, - const K* labels, T* y) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { y[i] = -SafeLog(x[i * num_classes + label]); } - } -} - -template -__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const half* x, - const K* labels, half* y) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { - y[i] = __float2half(-SafeLog(__half2float(x[i * num_classes + label]))); - } - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const T* x, - const K* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { - dx[i * num_classes + label] = -dy[i] / MaxWithLogThreshold(x[i * num_classes + label]); - } - } -} - -template -__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const half* x, - const K* labels, const half* dy, half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { - dx[i * num_classes + label] = - __hneg(__hdiv(__float2half(dy[i]), MaxWithLogThreshold(x[i * num_classes + label]))); - } - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, - const T* prob, const K* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = dy[row_id] * (prob[i] - 1); - } else { - dx[i] = dy[row_id] * prob[i]; - } - } -} - -template -__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, - const half* prob, const K* labels, const half* dy, - half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || 
defined(__HIP_DEVICE_COMPILE__) - CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { - // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = __hmul(dy[row_id], __hsub(prob[i], __float2half(1.0))); - } else { - dx[i] = __hmul(dy[row_id], prob[i]); - } - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -template -__global__ void ComputeDiffWithSoftmaxGpuHalf2(const int64_t elem_cnt, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, - const half* prob, const K* labels, const half* dy, - half* dx) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const int64_t h2_num_classes = num_classes / 2; - const int64_t h2_elem_cnt = elem_cnt / 2; - const auto* prob_h2 = reinterpret_cast(prob); - auto* dx_h2 = reinterpret_cast(dx); - CUDA_1D_KERNEL_LOOP_T(IndexType, i, h2_elem_cnt) { - const IndexType row_id = i / h2_num_classes; - const IndexType h2_col_id = i - row_id * h2_num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - const half2 prob_h2_i = prob_h2[i]; - const half dy_row = dy[row_id]; - half2 dx_h2_i; - dx_h2_i.data.x = __hmul(dy_row, __hsub(prob_h2_i.data.x, static_cast(label == 2 * h2_col_id))); - dx_h2_i.data.y = __hmul(dy_row, __hsub(prob_h2_i.data.y, static_cast(label == 2 * h2_col_id + 1))); - dx_h2[i] = dx_h2_i; - } -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} - -} // namespace - -template -struct SparseCrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const T* x, const K* labels, T* y) { - ComputeEntropyGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, x, labels, y); - } - - static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const T* x, const K* labels, const T* dy, T* dx) { - ComputeDiffGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, x, labels, dy, dx); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const T* prob, const K* labels, - const T* dy, T* dx) { - if (elem_cnt < GetMaxVal() / 2) { - ComputeDiffWithSoftmaxGpu - <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, - lower_bound, prob, labels, dy, dx); - } else { - ComputeDiffWithSoftmaxGpu - <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, - lower_bound, prob, labels, dy, dx); - } - } -}; - -template -struct SparseCrossEntropyKernelUtil { - static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const float16* x, const K* labels, - float16* y) { - ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, - reinterpret_cast(y)); - } - - static void ComputeDiff(ep::Stream* stream, const 
int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const float16* x, const K* labels, const float16* dy, float16* dx) { - ComputeDiffGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, - reinterpret_cast(dy), reinterpret_cast(dx)); - } - - static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const float16* prob, - const K* labels, const float16* dy, float16* dx) { - if (num_classes % 2 == 0) { - if (elem_cnt < GetMaxVal() / 2) { - ComputeDiffWithSoftmaxGpuHalf2 - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } else { - ComputeDiffWithSoftmaxGpuHalf2 - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } - } else { - if (elem_cnt < GetMaxVal() / 2) { - ComputeDiffWithSoftmaxGpuHalf - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } else { - ComputeDiffWithSoftmaxGpuHalf - <<As()->cuda_stream()>>>( - elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } - } - } -}; - -#define INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ - template struct SparseCrossEntropyKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" +#include "oneflow/core/kernel/kernel_util.hip.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +template +__global__ void ComputeEntropyGpu(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const T* x, + const K* labels, T* y) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { y[i] = -SafeLog(x[i * num_classes + label]); } + } +} + +template +__global__ void ComputeEntropyGpuHalf(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const half* x, + const K* labels, half* y) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { + y[i] = __float2half(-SafeLog(__half2float(x[i * num_classes + label]))); + } + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +template +__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const T* x, + const K* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { + dx[i * num_classes + label] = -dy[i] / MaxWithLogThreshold(x[i * num_classes + label]); + } + } +} + +template +__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const half* x, + const K* labels, const half* dy, half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { + dx[i * num_classes + label] = + __hneg(__hdiv(__float2half(dy[i]), MaxWithLogThreshold(x[i * num_classes + label]))); + } + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +template +__global__ void ComputeDiffWithSoftmaxGpu(const int64_t elem_cnt, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, + const T* prob, const K* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = dy[row_id] * (prob[i] - 1); + } else { + dx[i] = dy[row_id] * prob[i]; + } + } +} + +template +__global__ void ComputeDiffWithSoftmaxGpuHalf(const int64_t elem_cnt, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, + const half* prob, const K* labels, const half* dy, + half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || 
defined(__HIP_DEVICE_COMPILE__) + CUDA_1D_KERNEL_LOOP_T(IndexType, i, elem_cnt) { + // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = __hmul(dy[row_id], __hsub(prob[i], __float2half(1.0))); + } else { + dx[i] = __hmul(dy[row_id], prob[i]); + } + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +template +__global__ void ComputeDiffWithSoftmaxGpuHalf2(const int64_t elem_cnt, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, + const half* prob, const K* labels, const half* dy, + half* dx) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const int64_t h2_num_classes = num_classes / 2; + const int64_t h2_elem_cnt = elem_cnt / 2; + const auto* prob_h2 = reinterpret_cast(prob); + auto* dx_h2 = reinterpret_cast(dx); + CUDA_1D_KERNEL_LOOP_T(IndexType, i, h2_elem_cnt) { + const IndexType row_id = i / h2_num_classes; + const IndexType h2_col_id = i - row_id * h2_num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + const half2 prob_h2_i = prob_h2[i]; + const half dy_row = dy[row_id]; + half2 dx_h2_i; + dx_h2_i.data.x = __hmul(dy_row, __hsub(prob_h2_i.data.x, static_cast(label == 2 * h2_col_id))); + dx_h2_i.data.y = __hmul(dy_row, __hsub(prob_h2_i.data.y, static_cast(label == 2 * h2_col_id + 1))); + dx_h2[i] = dx_h2_i; + } +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} + +} // namespace + +template +struct SparseCrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const T* x, const K* labels, T* y) { + ComputeEntropyGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, x, labels, y); + } + + static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const T* x, const K* labels, const T* dy, T* dx) { + ComputeDiffGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, x, labels, dy, dx); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const T* prob, const K* labels, + const T* dy, T* dx) { + if (elem_cnt < GetMaxVal() / 2) { + ComputeDiffWithSoftmaxGpu + <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, + lower_bound, prob, labels, dy, dx); + } else { + ComputeDiffWithSoftmaxGpu + <<As()->cuda_stream()>>>(elem_cnt, num_classes, depth, + lower_bound, prob, labels, dy, dx); + } + } +}; + +template +struct SparseCrossEntropyKernelUtil { + static void ComputeEntropy(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const float16* x, const K* labels, + float16* y) { + ComputeEntropyGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, + reinterpret_cast(y)); + } + + static void ComputeDiff(ep::Stream* stream, const 
int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const float16* x, const K* labels, const float16* dy, float16* dx) { + ComputeDiffGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(x), labels, + reinterpret_cast(dy), reinterpret_cast(dx)); + } + + static void ComputeDiffWithSoftmax(ep::Stream* stream, const int64_t elem_cnt, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const float16* prob, + const K* labels, const float16* dy, float16* dx) { + if (num_classes % 2 == 0) { + if (elem_cnt < GetMaxVal() / 2) { + ComputeDiffWithSoftmaxGpuHalf2 + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } else { + ComputeDiffWithSoftmaxGpuHalf2 + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } + } else { + if (elem_cnt < GetMaxVal() / 2) { + ComputeDiffWithSoftmaxGpuHalf + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } else { + ComputeDiffWithSoftmaxGpuHalf + <<As()->cuda_stream()>>>( + elem_cnt, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } + } + } +}; + +#define INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ + template struct SparseCrossEntropyKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_SPARSE_CROSS_ENTROPY_KERNEL_UTIL_CUDA + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp index 869b283..71d7845 100644 --- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp +++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel.hip.cpp @@ -1,131 +1,131 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
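// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how the half2-vectorized
// ComputeDiffWithSoftmaxGpuHalf2 path above is typically structured, with the
// template arguments and <<<...>>> launch configuration written out in full.
// The 256-thread block size, the grid cap, and the helper names here are
// assumptions, not values taken from this repository.
// ---------------------------------------------------------------------------
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <algorithm>
#include <cstdint>

namespace sketch {

// Each thread handles one half2, i.e. two adjacent classes of one row:
// dx = dy * (prob - onehot(label)).
template<typename K, typename IndexType>
__global__ void DiffWithSoftmaxHalf2(IndexType h2_elem_cnt, IndexType h2_num_classes,
                                     int64_t lower_bound, const half2* prob, const K* labels,
                                     const half* dy, half2* dx) {
  for (IndexType i = blockIdx.x * blockDim.x + threadIdx.x; i < h2_elem_cnt;
       i += static_cast<IndexType>(blockDim.x) * gridDim.x) {
    const IndexType row = i / h2_num_classes;
    const IndexType h2_col = i - row * h2_num_classes;
    const int64_t label = static_cast<int64_t>(labels[row]) - lower_bound;
    const half p_lo = __low2half(prob[i]);
    const half p_hi = __high2half(prob[i]);
    const half d = dy[row];
    const half one = __float2half(1.0f);
    const half zero = __float2half(0.0f);
    const half g_lo = __hmul(d, __hsub(p_lo, label == 2 * h2_col ? one : zero));
    const half g_hi = __hmul(d, __hsub(p_hi, label == 2 * h2_col + 1 ? one : zero));
    dx[i] = __halves2half2(g_lo, g_hi);
  }
}

// Host-side dispatch: the half2 kernel is only usable when num_classes is even,
// so each thread's pair of classes never straddles a row boundary.
template<typename K>
void LaunchDiffWithSoftmaxHalf2(hipStream_t stream, int64_t elem_cnt, int64_t num_classes,
                                int64_t lower_bound, const half* prob, const K* labels,
                                const half* dy, half* dx) {
  const int64_t h2_elem_cnt = elem_cnt / 2;
  if (h2_elem_cnt == 0) { return; }
  constexpr int kBlock = 256;  // assumed block size
  const int grid =
      static_cast<int>(std::min<int64_t>((h2_elem_cnt + kBlock - 1) / kBlock, 4096));
  hipLaunchKernelGGL((DiffWithSoftmaxHalf2<K, int64_t>), dim3(grid), dim3(kBlock), 0, stream,
                     h2_elem_cnt, num_classes / 2, lower_bound,
                     reinterpret_cast<const half2*>(prob), labels, dy,
                     reinterpret_cast<half2*>(dx));
}

}  // namespace sketch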
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/kernel/cuda_graph_support.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { - -namespace { - -template -void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const T* in, T* prob) { - using ComputeType = typename cuda::softmax::DefaultComputeType::type; - cuda::softmax::DirectLoad load(in, col); - cuda::softmax::DirectStore store(prob, col); - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( - stream->As()->cuda_stream(), load, store, row, col))); -} - -template<> -void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const float16* in, - float16* prob) { - cuda::softmax::DirectLoad load(reinterpret_cast(in), col); - cuda::softmax::DirectStore store(reinterpret_cast(prob), col); - OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( - stream->As()->cuda_stream(), load, store, row, col))); -} - -template -__global__ void ComputeSparseSoftmaxCrossEntropyResultGpu(const int64_t num_instances, - const int64_t num_classes, - const int64_t depth, - const int64_t lower_bound, - const K* labels, const T* prob, T* out) { - CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { - assert(labels[i] >= 0); - assert(labels[i] < depth); - K label = labels[i] - lower_bound; - if (label >= 0 && label < num_classes) { out[i] = -prob[i * num_classes + label]; } - } -} -template -inline typename std::enable_if::value, void>::type -ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const K* labels, const T* prob, - T* out) { - ComputeSparseSoftmaxCrossEntropyResultGpu - <<As()->cuda_stream()>>>(num_instances, num_classes, depth, - lower_bound, labels, prob, out); -} -template -inline typename std::enable_if::value, void>::type -ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, - const int64_t lower_bound, const K* labels, const T* prob, - T* out) { -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - ComputeSparseSoftmaxCrossEntropyResultGpu - <<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, labels, - reinterpret_cast(prob), reinterpret_cast(out)); -#else - printf("use half need nvcc arch >= 530"); - assert(false); -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ -} -} // namespace - -template -class SparseSoftmaxCrossEntropyKernel final : public user_op::OpKernel, - public user_op::CudaGraphSupport { - public: - SparseSoftmaxCrossEntropyKernel() = default; - ~SparseSoftmaxCrossEntropyKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); - const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); - user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); - - const int64_t num_instances = label->shape_view().elem_cnt(); - CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); - const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; - const int64_t 
lower_bound = 0; - const int64_t depth = ctx->Attr("depth"); - - ComputeProb(ctx->stream(), num_instances, num_classes, prediction->dptr(), - prob->mut_dptr()); - ComputeSparseSoftmaxCrossEntropyResult(ctx->stream(), num_instances, num_classes, depth, - lower_bound, label->dptr(), prob->dptr(), - out->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL(dtype_pair, ltype_pair) \ - REGISTER_USER_KERNEL("sparse_softmax_cross_entropy") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ - && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair))); - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sparse_cross_entropy_kernel_util.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/kernel/cuda_graph_support.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { + +namespace { + +template +void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const T* in, T* prob) { + using ComputeType = typename cuda::softmax::DefaultComputeType::type; + cuda::softmax::DirectLoad load(in, col); + cuda::softmax::DirectStore store(prob, col); + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( + stream->As()->cuda_stream(), load, store, row, col))); +} + +template<> +void ComputeProb(ep::Stream* stream, const int64_t row, const int64_t col, const float16* in, + float16* prob) { + cuda::softmax::DirectLoad load(reinterpret_cast(in), col); + cuda::softmax::DirectStore store(reinterpret_cast(prob), col); + OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmax( + stream->As()->cuda_stream(), load, store, row, col))); +} + +template +__global__ void ComputeSparseSoftmaxCrossEntropyResultGpu(const int64_t num_instances, + const int64_t num_classes, + const int64_t depth, + const int64_t lower_bound, + const K* labels, const T* prob, T* out) { + CUDA_1D_KERNEL_LOOP_T(int64_t, i, num_instances) { + assert(labels[i] >= 0); + assert(labels[i] < depth); + K label = labels[i] - lower_bound; + if (label >= 0 && label < num_classes) { out[i] = -prob[i * num_classes + label]; } + } +} +template +inline typename std::enable_if::value, void>::type +ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const K* labels, const T* prob, + T* out) { + ComputeSparseSoftmaxCrossEntropyResultGpu + <<As()->cuda_stream()>>>(num_instances, num_classes, depth, + lower_bound, labels, prob, out); +} +template +inline 
typename std::enable_if::value, void>::type +ComputeSparseSoftmaxCrossEntropyResult(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, + const int64_t lower_bound, const K* labels, const T* prob, + T* out) { +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + ComputeSparseSoftmaxCrossEntropyResultGpu + <<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, labels, + reinterpret_cast(prob), reinterpret_cast(out)); +#else + printf("use half need nvcc arch >= 530"); + assert(false); +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ +} +} // namespace + +template +class SparseSoftmaxCrossEntropyKernel final : public user_op::OpKernel, + public user_op::CudaGraphSupport { + public: + SparseSoftmaxCrossEntropyKernel() = default; + ~SparseSoftmaxCrossEntropyKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* prediction = ctx->Tensor4ArgNameAndIndex("prediction", 0); + const user_op::Tensor* label = ctx->Tensor4ArgNameAndIndex("label", 0); + user_op::Tensor* prob = ctx->Tensor4ArgNameAndIndex("prob", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0); + + const int64_t num_instances = label->shape_view().elem_cnt(); + CHECK_EQ(prediction->shape_view().elem_cnt() % num_instances, 0); + const int64_t num_classes = prediction->shape_view().elem_cnt() / num_instances; + const int64_t lower_bound = 0; + const int64_t depth = ctx->Attr("depth"); + + ComputeProb(ctx->stream(), num_instances, num_classes, prediction->dptr(), + prob->mut_dptr()); + ComputeSparseSoftmaxCrossEntropyResult(ctx->stream(), num_instances, num_classes, depth, + lower_bound, label->dptr(), prob->dptr(), + out->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL(dtype_pair, ltype_pair) \ + REGISTER_USER_KERNEL("sparse_softmax_cross_entropy") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("label", 0) == OF_PP_PAIR_SECOND(ltype_pair)) \ + && (user_op::HobDataType("out", 0) == OF_PP_PAIR_SECOND(dtype_pair))); + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL, + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ) + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp index b2a004a..e637751 100644 --- a/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp +++ b/oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.hip.cpp @@ -1,134 +1,134 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
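// ---------------------------------------------------------------------------
// Minimal host-side reference (illustrative, not part of the patch) for the
// gradient produced by the ComputeDiffGpu kernels in the
// sparse_softmax_cross_entropy_kernel_util diff that follows. `log_prob` holds
// log-softmax values, matching the forward pass above that dispatches
// DispatchLogSoftmax, so the softmax is recovered with exp():
//   dx[r][c] = dy[r] * (exp(log_prob[r][c]) - (c == labels[r] - lower_bound))
// ---------------------------------------------------------------------------
#include <cmath>
#include <cstdint>
#include <vector>

namespace sketch {

template<typename T, typename K>
void SparseSoftmaxXentBackwardRef(int64_t rows, int64_t cols, int64_t lower_bound,
                                  const std::vector<T>& log_prob, const std::vector<K>& labels,
                                  const std::vector<T>& dy, std::vector<T>* dx) {
  dx->assign(rows * cols, T(0));
  for (int64_t r = 0; r < rows; ++r) {
    const int64_t label = static_cast<int64_t>(labels[r]) - lower_bound;
    for (int64_t c = 0; c < cols; ++c) {
      const T softmax = std::exp(log_prob[r * cols + c]);
      const T onehot = (label == c) ? T(1) : T(0);
      (*dx)[r * cols + c] = dy[r] * (softmax - onehot);
    }
  }
}

}  // namespace sketch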
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.h" -#include "oneflow/core/hip/softmax.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { -namespace user_op { -namespace { - -template -__inline__ __device__ T Exp(T x); - -template<> -__inline__ __device__ float Exp(float x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __expf(x); -#else - return exp(x); -#endif -} - -template<> -__inline__ __device__ double Exp(double x) { - return exp(x); -} - -template<> -__inline__ __device__ half Exp(half x) { -#ifdef OF_SOFTMAX_USE_FAST_MATH - return __float2half(__expf(__half2float(x))); -#else - return __float2half(exp(__half2float(x))); -#endif -} - -template -__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const T* prob, - const K* labels, const T* dy, T* dx) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = dy[row_id] * (Exp(prob[i]) - 1); - } else { - dx[i] = dy[row_id] * Exp(prob[i]); - } - } -} - -template -__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, - const int64_t depth, const int64_t lower_bound, const half* prob, - const K* labels, const half* dy, half* dx) { - CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { - const IndexType row_id = i / num_classes; - const IndexType col_id = i - row_id * num_classes; - assert(labels[row_id] >= 0); - assert(labels[row_id] < depth); - K label = labels[row_id] - lower_bound; - if (label == col_id) { - dx[i] = __hmul(dy[row_id], __hsub(Exp(prob[i]), __float2half(1.0))); - } else { - dx[i] = __hmul(dy[row_id], Exp(prob[i])); - } - } -} - -} // namespace - -template -struct SparseSoftmaxCrossEntropyKernelUtil { - static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const T* prob, const K* labels, const T* dy, T* dx) { - if (num_instances < GetMaxVal() / 2) { - ComputeDiffGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); - } else { - // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. 
- ComputeDiffGpu<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); - } - } -}; - -template -struct SparseSoftmaxCrossEntropyKernelUtil { - static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, - const int64_t num_classes, const int64_t depth, const int64_t lower_bound, - const float16* prob, const K* labels, const float16* dy, float16* dx) { - if (num_instances < GetMaxVal() / 2) { - ComputeDiffGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } else { - ComputeDiffGpuHalf<<As()->cuda_stream()>>>( - num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), - labels, reinterpret_cast(dy), reinterpret_cast(dx)); - } - } -}; - -#define INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ - template struct SparseSoftmaxCrossEntropyKernelUtil< \ - DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA - -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
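// ---------------------------------------------------------------------------
// Illustrative sketch of the index-type dispatch used by the launchers in this
// file (see the NOTE about int64_t division above): 32-bit indexing is selected
// whenever the element count fits with headroom, because 64-bit '/' and '%' in
// the kernel loop are significantly slower on GPU. The helper below is an
// assumed minimal form, not an API of this repository.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <limits>
#include <utility>

namespace sketch {

template<typename Fn32, typename Fn64>
void DispatchIndexType(int64_t elem_cnt, Fn32&& launch_with_int32, Fn64&& launch_with_int64) {
  // The "/ 2" keeps headroom so intermediate index arithmetic cannot overflow int32_t.
  if (elem_cnt < std::numeric_limits<int32_t>::max() / 2) {
    std::forward<Fn32>(launch_with_int32)();
  } else {
    std::forward<Fn64>(launch_with_int64)();
  }
}

}  // namespace sketch

// Usage (schematic): sketch::DispatchIndexType(n,
//     [&] { /* launch Kernel<..., int32_t> */ },
//     [&] { /* launch Kernel<..., int64_t> */ });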
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sparse_softmax_cross_entropy_kernel_util.h" +#include "oneflow/core/hip/softmax.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { +namespace user_op { +namespace { + +template +__inline__ __device__ T Exp(T x); + +template<> +__inline__ __device__ float Exp(float x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __expf(x); +#else + return exp(x); +#endif +} + +template<> +__inline__ __device__ double Exp(double x) { + return exp(x); +} + +template<> +__inline__ __device__ half Exp(half x) { +#ifdef OF_SOFTMAX_USE_FAST_MATH + return __float2half(__expf(__half2float(x))); +#else + return __float2half(exp(__half2float(x))); +#endif +} + +template +__global__ void ComputeDiffGpu(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const T* prob, + const K* labels, const T* dy, T* dx) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = dy[row_id] * (Exp(prob[i]) - 1); + } else { + dx[i] = dy[row_id] * Exp(prob[i]); + } + } +} + +template +__global__ void ComputeDiffGpuHalf(const int64_t num_instances, const int64_t num_classes, + const int64_t depth, const int64_t lower_bound, const half* prob, + const K* labels, const half* dy, half* dx) { + CUDA_1D_KERNEL_LOOP_T(IndexType, i, num_instances) { + const IndexType row_id = i / num_classes; + const IndexType col_id = i - row_id * num_classes; + assert(labels[row_id] >= 0); + assert(labels[row_id] < depth); + K label = labels[row_id] - lower_bound; + if (label == col_id) { + dx[i] = __hmul(dy[row_id], __hsub(Exp(prob[i]), __float2half(1.0))); + } else { + dx[i] = __hmul(dy[row_id], Exp(prob[i])); + } + } +} + +} // namespace + +template +struct SparseSoftmaxCrossEntropyKernelUtil { + static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const T* prob, const K* labels, const T* dy, T* dx) { + if (num_instances < GetMaxVal() / 2) { + ComputeDiffGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); + } else { + // NOTE(chengcheng): int division ('/') of i will reduce performance of int64_t. 
+ ComputeDiffGpu<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, prob, labels, dy, dx); + } + } +}; + +template +struct SparseSoftmaxCrossEntropyKernelUtil { + static void ComputeDiff(ep::Stream* stream, const int64_t num_instances, + const int64_t num_classes, const int64_t depth, const int64_t lower_bound, + const float16* prob, const K* labels, const float16* dy, float16* dx) { + if (num_instances < GetMaxVal() / 2) { + ComputeDiffGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } else { + ComputeDiffGpuHalf<<As()->cuda_stream()>>>( + num_instances, num_classes, depth, lower_bound, reinterpret_cast(prob), + labels, reinterpret_cast(dy), reinterpret_cast(dx)); + } + } +}; + +#define INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ + template struct SparseSoftmaxCrossEntropyKernelUtil< \ + DeviceType::kCUDA, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(index_type_pair)>; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ, INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_SPARSE_SOFTMAX_CROSS_ENTROPY_KERNEL_UTIL_CUDA + +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp b/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp index 7db4c19..fbe4d2a 100644 --- a/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp +++ b/oneflow/user/kernels/sqrt_square_sum_kernel_util.hip.cpp @@ -1,83 +1,83 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
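// ---------------------------------------------------------------------------
// Illustrative sketch (assumed helper names and block size, not the patch's
// exact code) of the two-pass reduction used by sqrt_square_sum below: with a
// single block, one hipcub::BlockReduce produces sqrt(sum(x*x)) directly; with
// many blocks, each block writes its partial sum into a temporary buffer and a
// second single-block pass sums the partials and applies sqrt.
// ---------------------------------------------------------------------------
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>

namespace sketch {

constexpr int kBlockThreads = 256;  // assumed block size

template<typename T>
__global__ void PartialSquareSum(int64_t n, const T* x, T* tmp) {
  T thread_sum = 0;
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<int64_t>(blockDim.x) * gridDim.x) {
    thread_sum += x[i] * x[i];
  }
  using BlockReduce = hipcub::BlockReduce<T, kBlockThreads>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const T block_sum = BlockReduce(temp_storage).Sum(thread_sum);
  if (threadIdx.x == 0) { tmp[blockIdx.x] = block_sum; }
}

// Launched with exactly one block over the per-block partial sums.
template<typename T>
__global__ void SqrtOfSum(int64_t n, const T* partial, T* y) {
  T thread_sum = 0;
  for (int64_t i = threadIdx.x; i < n; i += blockDim.x) { thread_sum += partial[i]; }
  using BlockReduce = hipcub::BlockReduce<T, kBlockThreads>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const T block_sum = BlockReduce(temp_storage).Sum(thread_sum);
  if (threadIdx.x == 0) { *y = sqrt(block_sum); }
}

}  // namespace sketch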
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/sqrt_square_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__global__ void SqrtSquareSumForOneThreadBlock(int64_t n, const T* x, T* y) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { *y = sqrt(b_sum); } -} - -template -__global__ void SqrtSumForMultiThreadBlock(int64_t n, const T* x, T* y) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { *y = sqrt(b_sum); } -} - -template -__global__ void SquareSumForMultiThreadBlock(int64_t n, const T* x, T* tmp) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { tmp[blockIdx.x] = b_sum; } -} - -} // namespace - -template -struct SqrtSquareSumKernelUtil { - static void SqrtSquareSum(ep::Stream* stream, int64_t n, const T* x, T* y, T* tmp) { - const int32_t num_blocks = BlocksNum4ThreadsNum(n); - CHECK_GE(num_blocks, 0); - if (num_blocks == 1) { - SqrtSquareSumForOneThreadBlock - <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); - } else { - Memset(stream, y, 0, sizeof(T)); - SquareSumForMultiThreadBlock - <<As()->cuda_stream()>>>( - n, x, tmp); - SqrtSumForMultiThreadBlock - <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>( - num_blocks, tmp, y); - } - } -}; - -#define INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct SqrtSquareSumKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); -#undef INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/sqrt_square_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__global__ void SqrtSquareSumForOneThreadBlock(int64_t n, const T* x, T* y) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { *y = sqrt(b_sum); } +} + +template +__global__ void SqrtSumForMultiThreadBlock(int64_t n, const T* x, T* y) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { *y = sqrt(b_sum); } +} + +template +__global__ void SquareSumForMultiThreadBlock(int64_t n, const T* x, T* tmp) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { tmp[blockIdx.x] = b_sum; } +} + +} // namespace + +template +struct SqrtSquareSumKernelUtil { + static void SqrtSquareSum(ep::Stream* stream, int64_t n, const T* x, T* y, T* tmp) { + const int32_t num_blocks = BlocksNum4ThreadsNum(n); + CHECK_GE(num_blocks, 0); + if (num_blocks == 1) { + SqrtSquareSumForOneThreadBlock + <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); + } else { + Memset(stream, y, 0, sizeof(T)); + SquareSumForMultiThreadBlock + <<As()->cuda_stream()>>>( + n, x, tmp); + SqrtSumForMultiThreadBlock + <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>( + num_blocks, tmp, y); + } + } +}; + +#define INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct SqrtSquareSumKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); +#undef INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/square_sum_kernel_util.hip.cpp b/oneflow/user/kernels/square_sum_kernel_util.hip.cpp index 490c6b9..62c8562 100644 --- a/oneflow/user/kernels/square_sum_kernel_util.hip.cpp +++ b/oneflow/user/kernels/square_sum_kernel_util.hip.cpp @@ -1,105 +1,105 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
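// ---------------------------------------------------------------------------
// Illustrative sketch of the multi-tensor batching used by MultiSquareSumGpu in
// the square_sum diff below: up to a fixed number of {ptr, count} entries are
// packed into a small struct passed to the kernel by value, each block reduces
// its partial sum with hipcub::BlockReduce, and block leaders accumulate into
// *y with an atomic add (so y must be zeroed beforehand). The 64-entry limit
// mirrors the diff; the rest is an assumed minimal form.
// ---------------------------------------------------------------------------
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>

namespace sketch {

constexpr int kMaxEntries = 64;
constexpr int kBlockThreads = 256;  // assumed block size

template<typename T>
struct SquareSumEntry {
  const T* ptr;
  int64_t count;
};

template<typename T>
struct MultiSquareSumArgs {
  SquareSumEntry<T> entries[kMaxEntries];
  int32_t size;
};

template<typename T>
__global__ void MultiSquareSum(MultiSquareSumArgs<T> args, T* y) {
  T thread_sum = 0;
  for (int32_t e = 0; e < args.size; ++e) {
    const SquareSumEntry<T> entry = args.entries[e];
    for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < entry.count;
         i += static_cast<int64_t>(blockDim.x) * gridDim.x) {
      thread_sum += entry.ptr[i] * entry.ptr[i];
    }
  }
  using BlockReduce = hipcub::BlockReduce<T, kBlockThreads>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  const T block_sum = BlockReduce(temp_storage).Sum(thread_sum);
  if (threadIdx.x == 0) { atomicAdd(y, block_sum); }  // y pre-initialized to 0
}

}  // namespace sketch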
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/square_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__global__ void SquareSumGpu(int64_t n, const T* x, T* y) { - T t_sum = 0; - CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { - if (ONE_BLOCK) { - *y = b_sum; - } else { - cuda::atomic::Add(y, b_sum); - } - } -} - -constexpr int64_t kMultiSquareSumMaxSize = 64; - -template -struct MultiSquareSumParams { - SquareSumParam params[kMultiSquareSumMaxSize]; - int32_t size; -}; - -template -__global__ void MultiSquareSumGpu(const MultiSquareSumParams params, T* y) { - T t_sum = 0; - for (int i = 0; i < params.size; ++i) { - const SquareSumParam param = params.params[i]; - CUDA_1D_KERNEL_LOOP(j, param.count) { t_sum += param.ptr[j] * param.ptr[j]; } - } - typedef hipcub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T b_sum = BlockReduce(temp_storage).Sum(t_sum); - if (threadIdx.x == 0) { cuda::atomic::Add(y, b_sum); } -} - -} // namespace - -template -struct SquareSumKernelUtil { - static void SquareSum(ep::Stream* stream, int64_t n, const T* x, T* y) { - const int32_t num_blocks = BlocksNum4ThreadsNum(n); - CHECK_GE(num_blocks, 0); - if (num_blocks == 0) { - Memset(stream, y, 0, sizeof(T)); - } else if (num_blocks == 1) { - SquareSumGpu - <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); - } else { - Memset(stream, y, 0, sizeof(T)); - SquareSumGpu - <<As()->cuda_stream()>>>( - n, x, y); - } - } - - static void MultiSquareSum(ep::Stream* stream, const std::vector>& params, - T* y) { - Memset(stream, y, 0, sizeof(T)); - for (int64_t start = 0; start < params.size(); start += kMultiSquareSumMaxSize) { - MultiSquareSumParams gpu_params{}; - int64_t max_count = 0; - gpu_params.size = std::min(start + kMultiSquareSumMaxSize, params.size()) - start; - for (int64_t i = 0; i < gpu_params.size; ++i) { - gpu_params.params[i] = params[start + i]; - max_count = std::max(max_count, gpu_params.params[i].count); - } - MultiSquareSumGpu<<As()->cuda_stream()>>>(gpu_params, y); - } - } -}; - -#define INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ - template struct SquareSumKernelUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); -#undef INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/square_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__global__ void SquareSumGpu(int64_t n, const T* x, T* y) { + T t_sum = 0; + CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { + if (ONE_BLOCK) { + *y = b_sum; + } else { + cuda::atomic::Add(y, b_sum); + } + } +} + +constexpr int64_t kMultiSquareSumMaxSize = 64; + +template +struct MultiSquareSumParams { + SquareSumParam params[kMultiSquareSumMaxSize]; + int32_t size; +}; + +template +__global__ void MultiSquareSumGpu(const MultiSquareSumParams params, T* y) { + T t_sum = 0; + for (int i = 0; i < params.size; ++i) { + const SquareSumParam param = params.params[i]; + CUDA_1D_KERNEL_LOOP(j, param.count) { t_sum += param.ptr[j] * param.ptr[j]; } + } + typedef hipcub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T b_sum = BlockReduce(temp_storage).Sum(t_sum); + if (threadIdx.x == 0) { cuda::atomic::Add(y, b_sum); } +} + +} // namespace + +template +struct SquareSumKernelUtil { + static void SquareSum(ep::Stream* stream, int64_t n, const T* x, T* y) { + const int32_t num_blocks = BlocksNum4ThreadsNum(n); + CHECK_GE(num_blocks, 0); + if (num_blocks == 0) { + Memset(stream, y, 0, sizeof(T)); + } else if (num_blocks == 1) { + SquareSumGpu + <<<1, kCudaThreadsNumPerBlock, 0, stream->As()->cuda_stream()>>>(n, x, y); + } else { + Memset(stream, y, 0, sizeof(T)); + SquareSumGpu + <<As()->cuda_stream()>>>( + n, x, y); + } + } + + static void MultiSquareSum(ep::Stream* stream, const std::vector>& params, + T* y) { + Memset(stream, y, 0, sizeof(T)); + for (int64_t start = 0; start < params.size(); start += kMultiSquareSumMaxSize) { + MultiSquareSumParams gpu_params{}; + int64_t max_count = 0; + gpu_params.size = std::min(start + kMultiSquareSumMaxSize, params.size()) - start; + for (int64_t i = 0; i < gpu_params.size; ++i) { + gpu_params.params[i] = params[start + i]; + max_count = std::max(max_count, gpu_params.params[i].count); + } + MultiSquareSumGpu<<As()->cuda_stream()>>>(gpu_params, y); + } + } +}; + +#define INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \ + template struct SquareSumKernelUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ); +#undef INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/stateful_opkernel.cpp b/oneflow/user/kernels/stateful_opkernel.cpp index b72a456..db373fe 100644 --- a/oneflow/user/kernels/stateful_opkernel.cpp +++ b/oneflow/user/kernels/stateful_opkernel.cpp @@ -1,901 +1,901 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/stateful_opkernel.h" -#include "oneflow/core/framework/attr_value_accessor.h" -#include "oneflow/core/framework/user_op_conf.h" -#include "oneflow/core/framework/user_op_registry_manager.h" -#include "oneflow/core/eager/eager_blob_object.h" -#include "oneflow/core/framework/attr_map.h" -#include "oneflow/core/rpc/include/global_process_ctx.h" -#include "oneflow/core/framework/consistent_tensor_infer_cache.h" -#include "oneflow/core/operator/operator.h" -#include "oneflow/core/profiler/profiler.h" -#include "oneflow/core/profiler/profile_manager.h" -#include "oneflow/core/profiler/event_recorder.h" -#include "oneflow/core/eager/call_context.h" - -namespace oneflow { -namespace one { - -class ConsistentTensorInferResult; - -using ArgVec = std::vector>; - -using EagerBlobObjectListRawPtr = const std::vector>*; -using ConsistentTensorInferResultRawPtr = const ConsistentTensorInferResult*; - -class ZeroCopyBaseContextHelper { - public: - ZeroCopyBaseContextHelper(const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : input_arg_tuple_(input_arg_tuple), output_arg_tuple_(output_arg_tuple) {} - -#define RETURN_IF_FOUND(inputs, outputs, post_action) \ - int32_t i = TryGetTensorTupleIndex(input_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), \ - arg_name, index); \ - if (i >= 0) { return (inputs).at(i) post_action; } \ - i = TryGetTensorTupleIndex(output_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), arg_name, \ - index); \ - if (i >= 0) { return (outputs).at(i) post_action; } - - user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - const int32_t index) const { - RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); - return nullptr; - } - - user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - const int32_t index) const { - RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); - if (arg_name == "tmp_buffer" && index == 0) { return call_ctx->mut_tmp_tensor(); } - return nullptr; - } - - const ConsistentTensorMeta* ConsistentTensorMeta4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - const int32_t index) const { - const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result(); - RETURN_IF_FOUND(consistent_tensor_infer_result->input_tensor_metas(), - consistent_tensor_infer_result->output_tensor_metas(), - .shared_from_symbol().get()); - return nullptr; - } - - Optional> parallel_desc(eager::CallContext* call_ctx) const { - const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result(); - if (!consistent_tensor_infer_result) { return Optional>(); } - if (!consistent_tensor_infer_result->input_tensor_metas().empty()) { - return consistent_tensor_infer_result->input_tensor_metas().at(0)->parallel_desc(); - } else if (!consistent_tensor_infer_result->output_tensor_metas().empty()) { - return consistent_tensor_infer_result->output_tensor_metas().at(0)->parallel_desc(); - } else { - UNIMPLEMENTED(); - return Optional>(); - } - } - - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - const auto& parallel_desc = this->parallel_desc(call_ctx); - if (parallel_desc.has_value()) { - const auto& parallel_desc_symbol = CHECK_JUST(parallel_desc); - return *CHECK_JUST(GetParallelContext4CurrentProcessCtx(parallel_desc_symbol)); - } else { - static ParallelContext 
single_device_parallel_ctx(MakeSingleDeviceParallelCtx()); - return single_device_parallel_ctx; - } - } - - const ArgVec& inputs() const { return input_arg_tuple_->indexed_arg_name_and_index(); } - const ArgVec& outputs() const { return output_arg_tuple_->indexed_arg_name_and_index(); } - - private: - static int32_t TryGetTensorTupleIndex(const std::unordered_map>& - arg_name2bn_index2tensor_tuple_index, - const std::string& arg_name, const int32_t arg_index) { - auto it = arg_name2bn_index2tensor_tuple_index.find(arg_name); - if (it != arg_name2bn_index2tensor_tuple_index.end()) { return it->second.at(arg_index); } - return -1; - } - - static ParallelContext MakeSingleDeviceParallelCtx() { - ParallelContext single_device_parallel_ctx; - single_device_parallel_ctx.set_parallel_id(0); - single_device_parallel_ctx.set_parallel_num(1); - return single_device_parallel_ctx; - } - - std::shared_ptr input_arg_tuple_; - std::shared_ptr output_arg_tuple_; -}; - -class UserKernelBaseContextHelper final : public ZeroCopyBaseContextHelper { - public: - UserKernelBaseContextHelper(DeviceType device_type, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : ZeroCopyBaseContextHelper(input_arg_tuple, output_arg_tuple), device_type_(device_type) {} - - ~UserKernelBaseContextHelper() = default; - - DeviceType device_type() const { return device_type_; } - const JobDesc& job_desc() const { - UNIMPLEMENTED(); - return *(const JobDesc*)nullptr; - } - - private: - const DeviceType device_type_; -}; - -class UserOpInferContextHelper final { - public: - UserOpInferContextHelper(const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - zero_copy_base_ctx_helper_(input_arg_tuple, output_arg_tuple) {} - - ~UserOpInferContextHelper() = default; - - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - UNIMPLEMENTED(); - return nullptr; - } - - const user_op::TensorDesc& InputTensorDesc(eager::CallContext* call_ctx, - const std::string& arg_name, int32_t index) const { - return *CHECK_NOTNULL(TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)); - } - - user_op::TensorDesc* OutputTensorDesc(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return zero_copy_base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - - const Shape& InputShape(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *Shape4ArgNameAndIndex(call_ctx, arg_name, index); - } - Shape* OutputShape(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return Shape4ArgNameAndIndex(call_ctx, arg_name, index); - } - Shape* Shape4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_shape(); - } - const Stride& InputStride(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *Stride4ArgNameAndIndex(call_ctx, arg_name, index); - } - Stride* OutputStride(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const 
{ - return Stride4ArgNameAndIndex(call_ctx, arg_name, index); - } - Stride* Stride4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_stride(); - } - const DataType& InputDType(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *Dtype4ArgNameAndIndex(call_ctx, arg_name, index); - } - DataType* OutputDType(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return Dtype4ArgNameAndIndex(call_ctx, arg_name, index); - } - DataType* Dtype4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_data_type(); - } - bool InputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); - } - bool* OutputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); - } - bool* IsDynamic4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_is_dynamic(); - } - - const ArgVec& inputs() const { return zero_copy_base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return zero_copy_base_ctx_helper_.outputs(); } - const JobDesc* job_desc() const { - UNIMPLEMENTED(); - return nullptr; - } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return zero_copy_base_ctx_helper_.parallel_ctx(call_ctx); - } - const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { - return *CHECK_JUST(zero_copy_base_ctx_helper_.parallel_desc(call_ctx)); - } - const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, int32_t index) const { - const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); - CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); - return nd_sbp.sbp_parallel(0); - } - const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *CHECK_NOTNULL(zero_copy_base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex( - call_ctx, arg_name, index)) - ->nd_sbp(); - } - - int64_t parallel_num(eager::CallContext* call_ctx) const { - return parallel_ctx(call_ctx).parallel_num(); - } - - const std::string& input(const std::string& arg_name, int32_t index) const { - return user_op_conf().input(arg_name, index); - } - const std::string& output(const std::string& arg_name, int32_t index) const { - return user_op_conf().output(arg_name, index); - } - bool has_input(const std::string& arg_name, int32_t index) const { - return user_op_conf().has_input(arg_name, index); - } - bool has_output(const std::string& arg_name, int32_t index) const { - return user_op_conf().has_output(arg_name, index); - } - int32_t input_size(const std::string& arg_name) const { - return user_op_conf().input_size(arg_name); - } - int32_t output_size(const std::string& arg_name) const { - return user_op_conf().output_size(arg_name); - } - const std::string& op_name() const { return user_op_conf().op_name(); } - const std::string& op_type_name() const { return user_op_conf().op_type_name(); } - const std::string& op_loc() const { return user_op_conf_->op_conf().loc(); } - 
- const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - private: - user_op::TensorDesc* NonNullTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - user_op::TensorDesc* tensor_desc = TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - if (!tensor_desc) { LOG(FATAL) << "Arg (" << arg_name << "," << index << ") is not found"; } - return tensor_desc; - } - - const user_op::UserOpConfWrapper* user_op_conf_; - ZeroCopyBaseContextHelper zero_copy_base_ctx_helper_; -}; - -class UserOpInferContext : public user_op::InferContext { - public: - UserOpInferContext(const UserOpInferContextHelper* helper, eager::CallContext* call_ctx) - : helper_(helper), call_ctx_(call_ctx) {} - - ~UserOpInferContext() override = default; - - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const user_op::TensorDesc& InputTensorDesc(const std::string& arg_name, - int32_t index) const override { - return helper_->InputTensorDesc(call_ctx_, arg_name, index); - } - user_op::TensorDesc* OutputTensorDesc(const std::string& arg_name, int32_t index) override { - return helper_->OutputTensorDesc(call_ctx_, arg_name, index); - } - user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, int32_t index) { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const Shape& InputShape(const std::string& arg_name, int32_t index) const override { - return helper_->InputShape(call_ctx_, arg_name, index); - } - Shape* OutputShape(const std::string& arg_name, int32_t index) override { - return helper_->OutputShape(call_ctx_, arg_name, index); - } - Shape* Shape4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Shape4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const Stride& InputStride(const std::string& arg_name, int32_t index) const override { - return helper_->InputStride(call_ctx_, arg_name, index); - } - Stride* OutputStride(const std::string& arg_name, int32_t index) override { - return helper_->OutputStride(call_ctx_, arg_name, index); - } - Stride* Stride4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Stride4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const DataType& InputDType(const std::string& arg_name, int32_t index) const override { - return helper_->InputDType(call_ctx_, arg_name, index); - } - DataType* OutputDType(const std::string& arg_name, int32_t index) override { - return helper_->OutputDType(call_ctx_, arg_name, index); - } - DataType* Dtype4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Dtype4ArgNameAndIndex(call_ctx_, arg_name, index); - } - bool InputIsDynamic(const std::string& arg_name, int32_t index) const override { - return helper_->InputIsDynamic(call_ctx_, arg_name, index); - } - bool* OutputIsDynamic(const std::string& arg_name, int32_t index) override { - return helper_->OutputIsDynamic(call_ctx_, arg_name, index); - } - bool* IsDynamic4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->IsDynamic4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const ArgVec& 
inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - const JobDesc* job_desc() const override { return helper_->job_desc(); } - const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } - const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { - return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - int64_t parallel_num() const override { return helper_->parallel_num(call_ctx_); } - - const std::string& input(const std::string& arg_name, int32_t index) const override { - return helper_->input(arg_name, index); - } - const std::string& output(const std::string& arg_name, int32_t index) const override { - return helper_->output(arg_name, index); - } - bool has_input(const std::string& arg_name, int32_t index) const override { - return helper_->has_input(arg_name, index); - } - bool has_output(const std::string& arg_name, int32_t index) const override { - return helper_->has_output(arg_name, index); - } - int32_t input_size(const std::string& arg_name) const override { - return helper_->input_size(arg_name); - } - int32_t output_size(const std::string& arg_name) const override { - return helper_->output_size(arg_name); - } - const std::string& op_name() const override { return helper_->op_name(); } - const std::string& op_type_name() const override { return helper_->op_type_name(); } - const std::string& op_loc() const override { return helper_->op_loc(); } - - private: - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const UserOpInferContextHelper* helper_; - eager::CallContext* call_ctx_; -}; - -class UserKernelComputeContextHelper final { - public: - UserKernelComputeContextHelper(DeviceType device_type, - const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} - - ~UserKernelComputeContextHelper() = default; - - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - - user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.Tensor4ArgNameAndIndex(call_ctx, arg_name, index); - } - ep::Stream* stream(DeviceCtx* device_ctx) const { - CHECK(device_ctx); - return device_ctx->stream(); - } - - DeviceType device_type() const { return base_ctx_helper_.device_type(); } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return base_ctx_helper_.parallel_ctx(call_ctx); - } - - const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } - - const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& 
attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - private: - const user_op::UserOpConfWrapper* user_op_conf_; - UserKernelBaseContextHelper base_ctx_helper_; -}; - -class UserKernelComputeContext final : public user_op::KernelComputeContext { - public: - UserKernelComputeContext(const UserKernelComputeContextHelper* helper, - eager::CallContext* call_ctx, DeviceCtx* device_ctx) - : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} - - ~UserKernelComputeContext() = default; - - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - user_op::Tensor* Tensor4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { - return helper_->Tensor4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - ep::Stream* stream() override { return helper_->stream(device_ctx_); } - - DeviceType device_type() const override { return helper_->device_type(); } - - const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - - const ArgVec& inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - - private: - const user_op::UserOpConfWrapper& user_op_conf() const override { - return helper_->user_op_conf(); - } - - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const UserKernelComputeContextHelper* helper_; - eager::CallContext* call_ctx_; - DeviceCtx* device_ctx_; -}; - -class UserKernelRegContextHelper final { - public: - UserKernelRegContextHelper(DeviceType device_type, const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} - ~UserKernelRegContextHelper() = default; - - DeviceType device_type() const { return base_ctx_helper_.device_type(); } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return base_ctx_helper_.parallel_ctx(call_ctx); - } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } - - const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - private: - const user_op::UserOpConfWrapper* user_op_conf_; - UserKernelBaseContextHelper base_ctx_helper_; -}; - -class UserKernelRegContext final : public user_op::KernelRegContext { - public: - UserKernelRegContext(const UserKernelRegContextHelper* helper, eager::CallContext* call_ctx) - : helper_(helper), call_ctx_(call_ctx) {} - ~UserKernelRegContext() = default; - - DeviceType device_type() const override { return helper_->device_type(); } - const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const 
override { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const ArgVec& inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - - const user_op::UserOpConfWrapper& user_op_conf() const override { - return helper_->user_op_conf(); - } - - private: - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const UserKernelRegContextHelper* helper_; - eager::CallContext* call_ctx_; -}; - -class UserKernelInitAndCacheContextHelper final { - public: - UserKernelInitAndCacheContextHelper(DeviceType device_type, - const user_op::UserOpConfWrapper* user_op_conf, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) - : user_op_conf_(user_op_conf), - base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} - - ~UserKernelInitAndCacheContextHelper() = default; - - ep::Stream* stream(DeviceCtx* device_ctx) const { - CHECK(device_ctx); - return device_ctx->stream(); - } - - DeviceType device_type() const { return base_ctx_helper_.device_type(); } - const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { - return base_ctx_helper_.parallel_ctx(call_ctx); - } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); - } - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, - int32_t index) const { - return base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index); - } - const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, - const std::string& arg_name, int32_t index) const { - const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); - CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); - return nd_sbp.sbp_parallel(0); - } - - const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, - int32_t index) const { - return *CHECK_NOTNULL( - base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index)) - ->nd_sbp(); - } - - const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } - const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } - const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { - return *CHECK_JUST(base_ctx_helper_.parallel_desc(call_ctx)); - } - - const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, - const std::string& attr_name) const { - return call_ctx->composed_attrs().Attr4Name(attr_name); - } - - const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } - - private: - const user_op::UserOpConfWrapper* user_op_conf_; - UserKernelBaseContextHelper base_ctx_helper_; -}; - -class UserKernelInitAndCacheContext final : public user_op::KernelInitContext, - public user_op::KernelCacheContext { - public: - UserKernelInitAndCacheContext(const UserKernelInitAndCacheContextHelper* helper, - eager::CallContext* call_ctx, DeviceCtx* device_ctx) - : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} - - ~UserKernelInitAndCacheContext() override = default; - - ep::Stream* stream() override { return helper_->stream(device_ctx_); } - - DeviceType device_type() const override { return helper_->device_type(); } - const ParallelContext& 
parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } - const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); - } - const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, - int32_t index) const override { - return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { - return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); - } - - const ArgVec& inputs() const override { return helper_->inputs(); } - const ArgVec& outputs() const override { return helper_->outputs(); } - const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } - - private: - const std::shared_ptr& Attr4Name( - const std::string& attr_name) const override { - return helper_->Attr4Name(call_ctx_, attr_name); - } - - const user_op::UserOpConfWrapper& user_op_conf() const override { - return helper_->user_op_conf(); - } - - const UserKernelInitAndCacheContextHelper* helper_; - eager::CallContext* call_ctx_; - DeviceCtx* device_ctx_; -}; - -namespace { - -Maybe InitTensorTupleIndexes4Bns(const std::shared_ptr& op_conf, - const ArgVec& indexed_input_pairs, - const ArgVec& indexed_output_pairs, - std::vector* input_tuple_indexes4const_ibns, - std::vector* input_tuple_indexes4mut_ibns, - std::vector* output_tuple_indexes4mut_obns, - std::vector* output_tuple_indexes4mut2_obns) { - const auto* op_reg_val = - user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(op_conf->user_conf().op_type_name()); - CHECK_NOTNULL_OR_RETURN(op_reg_val); - - ArgModifierSignature arg_modifier_signature; - for (const auto& pair : indexed_input_pairs) { - const std::string ibn = GenRepeatedBn(pair.first, pair.second); - arg_modifier_signature.mutable_ibn2input_blob_modifier()->insert( - {ibn, user_op::InputArgModifier()}); - } - for (const auto& pair : indexed_output_pairs) { - const std::string obn = GenRepeatedBn(pair.first, pair.second); - arg_modifier_signature.mutable_obn2output_blob_modifier()->insert( - {obn, user_op::OutputArgModifier()}); - } - user_op::UserOpConfWrapper op_conf_wrapper(op_conf); - if (op_reg_val->input_arg_modify_fn) { - user_op::GetInputArgModifier GetInputArgModifierFn = - [&arg_modifier_signature](const std::string& in_arg_name, - int32_t in_arg_index) -> user_op::InputArgModifier* { - const std::string ibn = GenRepeatedBn(in_arg_name, in_arg_index); - auto* map = arg_modifier_signature.mutable_ibn2input_blob_modifier(); - return &map->at(ibn); - }; - JUST(op_reg_val->input_arg_modify_fn(GetInputArgModifierFn, op_conf_wrapper)); - } - if (op_reg_val->output_arg_modify_fn) { - user_op::GetOutputArgModifier GetOutputArgModifierFn = - [&arg_modifier_signature](const std::string& in_arg_name, - int32_t in_arg_index) -> user_op::OutputArgModifier* { - const std::string obn = GenRepeatedBn(in_arg_name, in_arg_index); - auto* map = arg_modifier_signature.mutable_obn2output_blob_modifier(); - return &map->at(obn); - }; - JUST(op_reg_val->output_arg_modify_fn(GetOutputArgModifierFn, op_conf_wrapper)); - } - - for (int i = 0; i < indexed_input_pairs.size(); i++) { - const auto& pair = 
indexed_input_pairs.at(i); - const std::string ibn = GenRepeatedBn(pair.first, pair.second); - if (arg_modifier_signature.ibn2input_blob_modifier().at(ibn).is_mutable()) { - input_tuple_indexes4mut_ibns->emplace_back(i); - } else { - input_tuple_indexes4const_ibns->emplace_back(i); - } - } - - for (int i = 0; i < indexed_output_pairs.size(); i++) { - const auto& pair = indexed_output_pairs.at(i); - const std::string obn = GenRepeatedBn(pair.first, pair.second); - if (arg_modifier_signature.obn2output_blob_modifier().at(obn).header_infered_before_compute()) { - output_tuple_indexes4mut_obns->emplace_back(i); - } else { - output_tuple_indexes4mut2_obns->emplace_back(i); - } - } - return Maybe::Ok(); -} - -} // namespace - -/* static */ Maybe StatefulOpKernel::New( - const std::shared_ptr& op_conf, const Symbol& stream, - const AttrMap& base_attrs, const std::shared_ptr& parallel_desc, - const std::shared_ptr& input_arg_tuple, - const std::shared_ptr& output_arg_tuple) { - auto opkernel = std::shared_ptr(new StatefulOpKernel()); - opkernel->base_attrs_ = base_attrs; - opkernel->op_conf_ = op_conf; - opkernel->user_op_conf_.reset(new user_op::UserOpConfWrapper(op_conf)); - opkernel->stream_ = stream; - opkernel->input_arg_tuple_ = input_arg_tuple; - opkernel->output_arg_tuple_ = output_arg_tuple; - opkernel->need_check_mem_case_ = true; - - const DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(op_conf->device_tag())); - const user_op::UserOpConfWrapper* user_op_conf = opkernel->user_op_conf_.get(); - opkernel->op_infer_ctx_helper_.reset( - new UserOpInferContextHelper(user_op_conf, input_arg_tuple, output_arg_tuple)); - - opkernel->init_and_cache_ctx_helper_.reset(new UserKernelInitAndCacheContextHelper( - device_type, opkernel->user_op_conf_.get(), opkernel->input_arg_tuple_, - opkernel->output_arg_tuple_)); - opkernel->compute_ctx_helper_.reset(new UserKernelComputeContextHelper( - device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); - opkernel->reg_ctx_helper_.reset( - new UserKernelRegContextHelper(device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); - const auto* op_reg_val = - user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(user_op_conf->op_type_name()); - CHECK_NOTNULL_OR_RETURN(op_reg_val); - if (op_reg_val->logical_tensor_desc_infer_fn) { - opkernel->tensor_desc_infer_fn_ = op_reg_val->logical_tensor_desc_infer_fn; - } else { - return Error::UnimplementedError(); - } - opkernel->data_type_infer_fn_ = op_reg_val->data_type_infer_fn; - - JUST(InitTensorTupleIndexes4Bns( - op_conf, input_arg_tuple->indexed_arg_name_and_index(), - output_arg_tuple->indexed_arg_name_and_index(), &opkernel->input_tuple_indexes4const_ibns_, - &opkernel->input_tuple_indexes4mut_ibns_, &opkernel->output_tuple_indexes4mut_obns_, - &opkernel->output_tuple_indexes4mut2_obns_)); - - return opkernel; -} - -StatefulOpKernel::~StatefulOpKernel() = default; - -size_t StatefulOpKernel::InferTmpSize(eager::CallContext* call_ctx, - const user_op::OpKernel* user_opkernel) const { - UserOpInferContext op_infer_ctx(op_infer_ctx_helper_.get(), call_ctx); - const auto& InferTmpSizeFn = GetInferTmpSizeFn(user_opkernel); - return InferTmpSizeFn(&op_infer_ctx); -} - -Maybe StatefulOpKernel::ChooseOpKernel(eager::CallContext* call_ctx, - const user_op::OpKernel** user_opkernel, - bool* need_temp_storage) { - OF_PROFILER_RANGE_GUARD("ChooseOpKernel"); - DataType primary_dtype = kInvalidDataType; - const auto& inputs = call_ctx->inputs(); - const auto& outputs = call_ctx->outputs(); - if 
(likely(!inputs->empty())) { - primary_dtype = (*inputs)[0]->data_type(); - } else if (likely(!outputs->empty())) { - primary_dtype = (*outputs)[0]->data_type(); - } else { - // do nothing - } - - UserKernelRegContext reg_ctx(reg_ctx_helper_.get(), call_ctx); - for (const auto& pair : dtype2cached_kernels_[primary_dtype]) { - if (likely(pair.first->is_matched_hob->get(reg_ctx))) { - *need_temp_storage = pair.first->need_temp_storage; - *user_opkernel = pair.second.get(); - return Maybe::Ok(); - } - } - - OF_PROFILER_RANGE_GUARD("fallback"); - - const auto& op_type_name = user_op_conf_->op_type_name(); - const auto* kernel_reg_val = - JUST(user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult(op_type_name, reg_ctx)); - CHECK_NOTNULL(kernel_reg_val); - auto* kernel = kernel_reg_val->create_fn(); - dtype2cached_kernels_[primary_dtype].push_back( - {kernel_reg_val, std::shared_ptr(kernel)}); - - infer_tmp_size_fn_map_.emplace(kernel, &kernel_reg_val->infer_tmp_size_fn); - *need_temp_storage = kernel_reg_val->need_temp_storage; - *user_opkernel = kernel; - return Maybe::Ok(); -} - -void StatefulOpKernel::TryInitOpKernelStateAndCache(eager::CallContext* call_ctx, - DeviceCtx* device_ctx, - const user_op::OpKernel* op_kernel, - user_op::OpKernelState** state, - user_op::OpKernelCache** cache) { - UserKernelInitAndCacheContext init_and_cache_ctx(init_and_cache_ctx_helper_.get(), call_ctx, - device_ctx); - if (state != nullptr) { - auto it = op_kernel_state_map_.find(op_kernel); - if (it != op_kernel_state_map_.end()) { - *state = it->second.get(); - } else { - auto created_state = op_kernel->CreateOpKernelState(&init_and_cache_ctx); - op_kernel_state_map_.emplace(op_kernel, created_state); - *state = created_state.get(); - } - } - - { - auto& cache_in_map = op_kernel_cache_map_[op_kernel]; - op_kernel->InitOpKernelCacheWithFlags(&init_and_cache_ctx, - user_op::OpKernelCache::kAllMayChanged, &cache_in_map); - *cache = cache_in_map.get(); - } -} - -const user_op::InferTmpSizeFn& StatefulOpKernel::GetInferTmpSizeFn( - const user_op::OpKernel* op_kernel) const { - return *infer_tmp_size_fn_map_.at(op_kernel); -} - -user_op::TensorDescInferFn StatefulOpKernel::TensorDescInferFn() const { - return tensor_desc_infer_fn_; -} - -user_op::DataTypeInferFn StatefulOpKernel::DataTypeInferFn() const { return data_type_infer_fn_; } - -void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_ctx, - const user_op::OpKernel* user_opkernel, - user_op::OpKernelState* state, - const user_op::OpKernelCache* cache) const { - UserKernelComputeContext compute_context(compute_ctx_helper_.get(), call_ctx, device_ctx); - auto* compute_ctx = &compute_context; - OF_PROFILER_RANGE_GUARD("Compute"); - if (Singleton::Get()) { -#if defined(WITH_CUDA) || defined(WITH_ROCM) - const auto CalMemorySize = [compute_ctx](const one::ArgVec& args) -> int64_t { - const auto Func = [compute_ctx](int64_t mem_size, const auto& pair) { - const auto tensor = compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second); - return mem_size + tensor->shape_view().elem_cnt() * GetSizeOfDataType(tensor->data_type()); - }; - return std::accumulate(args.begin(), args.end(), static_cast(0), Func); - }; -#endif - auto er_guard = CHECK_JUST(profiler::EventRecorder::CreateKernelEventRecorder( - op_type_name(), -#if defined(WITH_CUDA) || defined(WITH_ROCM) - [compute_ctx, CalMemorySize]() -> int64_t { - return CalMemorySize(compute_ctx->inputs()) + CalMemorySize(compute_ctx->outputs()); - }, -#endif - [compute_ctx]() -> 
std::vector { - std::vector shapes; - for (const auto& pair : compute_ctx->inputs()) { - shapes.emplace_back( - compute_ctx->TensorDesc4ArgNameAndIndex(pair.first, pair.second)->shape()); - } - return shapes; - })); - user_opkernel->Compute(compute_ctx, state, cache); - } else { - user_opkernel->Compute(compute_ctx, state, cache); - } -} - -} // namespace one -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/stateful_opkernel.h" +#include "oneflow/core/framework/attr_value_accessor.h" +#include "oneflow/core/framework/user_op_conf.h" +#include "oneflow/core/framework/user_op_registry_manager.h" +#include "oneflow/core/eager/eager_blob_object.h" +#include "oneflow/core/framework/attr_map.h" +#include "oneflow/core/rpc/include/global_process_ctx.h" +#include "oneflow/core/framework/consistent_tensor_infer_cache.h" +#include "oneflow/core/operator/operator.h" +#include "oneflow/core/profiler/profiler.h" +#include "oneflow/core/profiler/profile_manager.h" +#include "oneflow/core/profiler/event_recorder.h" +#include "oneflow/core/eager/call_context.h" + +namespace oneflow { +namespace one { + +class ConsistentTensorInferResult; + +using ArgVec = std::vector>; + +using EagerBlobObjectListRawPtr = const std::vector>*; +using ConsistentTensorInferResultRawPtr = const ConsistentTensorInferResult*; + +class ZeroCopyBaseContextHelper { + public: + ZeroCopyBaseContextHelper(const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : input_arg_tuple_(input_arg_tuple), output_arg_tuple_(output_arg_tuple) {} + +#define RETURN_IF_FOUND(inputs, outputs, post_action) \ + int32_t i = TryGetTensorTupleIndex(input_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), \ + arg_name, index); \ + if (i >= 0) { return (inputs).at(i) post_action; } \ + i = TryGetTensorTupleIndex(output_arg_tuple_->arg_name2bn_index2tensor_tuple_index(), arg_name, \ + index); \ + if (i >= 0) { return (outputs).at(i) post_action; } + + user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + const int32_t index) const { + RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); + return nullptr; + } + + user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + const int32_t index) const { + RETURN_IF_FOUND(*call_ctx->inputs(), *call_ctx->outputs(), .get()); + if (arg_name == "tmp_buffer" && index == 0) { return call_ctx->mut_tmp_tensor(); } + return nullptr; + } + + const ConsistentTensorMeta* ConsistentTensorMeta4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + const int32_t index) const { + const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result(); + RETURN_IF_FOUND(consistent_tensor_infer_result->input_tensor_metas(), + consistent_tensor_infer_result->output_tensor_metas(), + .shared_from_symbol().get()); + return nullptr; + } + + Optional> 
parallel_desc(eager::CallContext* call_ctx) const {
+    const auto& consistent_tensor_infer_result = call_ctx->consistent_tensor_infer_result();
+    if (!consistent_tensor_infer_result) { return Optional<Symbol<ParallelDesc>>(); }
+    if (!consistent_tensor_infer_result->input_tensor_metas().empty()) {
+      return consistent_tensor_infer_result->input_tensor_metas().at(0)->parallel_desc();
+    } else if (!consistent_tensor_infer_result->output_tensor_metas().empty()) {
+      return consistent_tensor_infer_result->output_tensor_metas().at(0)->parallel_desc();
+    } else {
+      UNIMPLEMENTED();
+      return Optional<Symbol<ParallelDesc>>();
+    }
+  }
+
+  const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const {
+    const auto& parallel_desc = this->parallel_desc(call_ctx);
+    if (parallel_desc.has_value()) {
+      const auto& parallel_desc_symbol = CHECK_JUST(parallel_desc);
+      return *CHECK_JUST(GetParallelContext4CurrentProcessCtx(parallel_desc_symbol));
+    } else {
+      static ParallelContext single_device_parallel_ctx(MakeSingleDeviceParallelCtx());
+      return single_device_parallel_ctx;
+    }
+  }
+
+  const ArgVec& inputs() const { return input_arg_tuple_->indexed_arg_name_and_index(); }
+  const ArgVec& outputs() const { return output_arg_tuple_->indexed_arg_name_and_index(); }
+
+ private:
+  static int32_t TryGetTensorTupleIndex(const std::unordered_map<std::string, std::vector<int32_t>>&
+                                            arg_name2bn_index2tensor_tuple_index,
+                                        const std::string& arg_name, const int32_t arg_index) {
+    auto it = arg_name2bn_index2tensor_tuple_index.find(arg_name);
+    if (it != arg_name2bn_index2tensor_tuple_index.end()) { return it->second.at(arg_index); }
+    return -1;
+  }
+
+  static ParallelContext MakeSingleDeviceParallelCtx() {
+    ParallelContext single_device_parallel_ctx;
+    single_device_parallel_ctx.set_parallel_id(0);
+    single_device_parallel_ctx.set_parallel_num(1);
+    return single_device_parallel_ctx;
+  }
+
+  std::shared_ptr<const ArgTuple> input_arg_tuple_;
+  std::shared_ptr<const ArgTuple> output_arg_tuple_;
+};
+
+class UserKernelBaseContextHelper final : public ZeroCopyBaseContextHelper {
+ public:
+  UserKernelBaseContextHelper(DeviceType device_type,
+                              const std::shared_ptr<const ArgTuple>& input_arg_tuple,
+                              const std::shared_ptr<const ArgTuple>& output_arg_tuple)
+      : ZeroCopyBaseContextHelper(input_arg_tuple, output_arg_tuple), device_type_(device_type) {}
+
+  ~UserKernelBaseContextHelper() = default;
+
+  DeviceType device_type() const { return device_type_; }
+  const JobDesc& job_desc() const {
+    UNIMPLEMENTED();
+    return *(const JobDesc*)nullptr;
+  }
+
+ private:
+  const DeviceType device_type_;
+};
+
+class UserOpInferContextHelper final {
+ public:
+  UserOpInferContextHelper(const user_op::UserOpConfWrapper* user_op_conf,
+                           const std::shared_ptr<const ArgTuple>& input_arg_tuple,
+                           const std::shared_ptr<const ArgTuple>& output_arg_tuple)
+      : user_op_conf_(user_op_conf),
+        zero_copy_base_ctx_helper_(input_arg_tuple, output_arg_tuple) {}
+
+  ~UserOpInferContextHelper() = default;
+
+  const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx,
+                                                               const std::string& arg_name,
+                                                               int32_t index) const {
+    UNIMPLEMENTED();
+    return nullptr;
+  }
+
+  const user_op::TensorDesc& InputTensorDesc(eager::CallContext* call_ctx,
+                                             const std::string& arg_name, int32_t index) const {
+    return *CHECK_NOTNULL(TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index));
+  }
+
+  user_op::TensorDesc* OutputTensorDesc(eager::CallContext* call_ctx, const std::string& arg_name,
+                                        int32_t index) const {
+    return TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index);
+  }
+  user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx,
+                                                  const std::string& arg_name,
+
int32_t index) const { + return zero_copy_base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + + const Shape& InputShape(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *Shape4ArgNameAndIndex(call_ctx, arg_name, index); + } + Shape* OutputShape(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return Shape4ArgNameAndIndex(call_ctx, arg_name, index); + } + Shape* Shape4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_shape(); + } + const Stride& InputStride(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *Stride4ArgNameAndIndex(call_ctx, arg_name, index); + } + Stride* OutputStride(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return Stride4ArgNameAndIndex(call_ctx, arg_name, index); + } + Stride* Stride4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_stride(); + } + const DataType& InputDType(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *Dtype4ArgNameAndIndex(call_ctx, arg_name, index); + } + DataType* OutputDType(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return Dtype4ArgNameAndIndex(call_ctx, arg_name, index); + } + DataType* Dtype4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_data_type(); + } + bool InputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); + } + bool* OutputIsDynamic(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return IsDynamic4ArgNameAndIndex(call_ctx, arg_name, index); + } + bool* IsDynamic4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return NonNullTensorDesc4ArgNameAndIndex(call_ctx, arg_name, index)->mut_is_dynamic(); + } + + const ArgVec& inputs() const { return zero_copy_base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return zero_copy_base_ctx_helper_.outputs(); } + const JobDesc* job_desc() const { + UNIMPLEMENTED(); + return nullptr; + } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return zero_copy_base_ctx_helper_.parallel_ctx(call_ctx); + } + const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { + return *CHECK_JUST(zero_copy_base_ctx_helper_.parallel_desc(call_ctx)); + } + const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, int32_t index) const { + const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); + CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); + return nd_sbp.sbp_parallel(0); + } + const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *CHECK_NOTNULL(zero_copy_base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex( + call_ctx, arg_name, index)) + ->nd_sbp(); + } + + int64_t parallel_num(eager::CallContext* call_ctx) const { + return parallel_ctx(call_ctx).parallel_num(); + } + + const std::string& 
input(const std::string& arg_name, int32_t index) const { + return user_op_conf().input(arg_name, index); + } + const std::string& output(const std::string& arg_name, int32_t index) const { + return user_op_conf().output(arg_name, index); + } + bool has_input(const std::string& arg_name, int32_t index) const { + return user_op_conf().has_input(arg_name, index); + } + bool has_output(const std::string& arg_name, int32_t index) const { + return user_op_conf().has_output(arg_name, index); + } + int32_t input_size(const std::string& arg_name) const { + return user_op_conf().input_size(arg_name); + } + int32_t output_size(const std::string& arg_name) const { + return user_op_conf().output_size(arg_name); + } + const std::string& op_name() const { return user_op_conf().op_name(); } + const std::string& op_type_name() const { return user_op_conf().op_type_name(); } + const std::string& op_loc() const { return user_op_conf_->op_conf().loc(); } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + private: + user_op::TensorDesc* NonNullTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + user_op::TensorDesc* tensor_desc = TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + if (!tensor_desc) { LOG(FATAL) << "Arg (" << arg_name << "," << index << ") is not found"; } + return tensor_desc; + } + + const user_op::UserOpConfWrapper* user_op_conf_; + ZeroCopyBaseContextHelper zero_copy_base_ctx_helper_; +}; + +class UserOpInferContext : public user_op::InferContext { + public: + UserOpInferContext(const UserOpInferContextHelper* helper, eager::CallContext* call_ctx) + : helper_(helper), call_ctx_(call_ctx) {} + + ~UserOpInferContext() override = default; + + const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const user_op::TensorDesc& InputTensorDesc(const std::string& arg_name, + int32_t index) const override { + return helper_->InputTensorDesc(call_ctx_, arg_name, index); + } + user_op::TensorDesc* OutputTensorDesc(const std::string& arg_name, int32_t index) override { + return helper_->OutputTensorDesc(call_ctx_, arg_name, index); + } + user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, int32_t index) { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const Shape& InputShape(const std::string& arg_name, int32_t index) const override { + return helper_->InputShape(call_ctx_, arg_name, index); + } + Shape* OutputShape(const std::string& arg_name, int32_t index) override { + return helper_->OutputShape(call_ctx_, arg_name, index); + } + Shape* Shape4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Shape4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const Stride& InputStride(const std::string& arg_name, int32_t index) const override { + return helper_->InputStride(call_ctx_, arg_name, index); + } + Stride* OutputStride(const std::string& arg_name, int32_t index) override { + return helper_->OutputStride(call_ctx_, arg_name, index); + } + Stride* Stride4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Stride4ArgNameAndIndex(call_ctx_, 
arg_name, index); + } + const DataType& InputDType(const std::string& arg_name, int32_t index) const override { + return helper_->InputDType(call_ctx_, arg_name, index); + } + DataType* OutputDType(const std::string& arg_name, int32_t index) override { + return helper_->OutputDType(call_ctx_, arg_name, index); + } + DataType* Dtype4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Dtype4ArgNameAndIndex(call_ctx_, arg_name, index); + } + bool InputIsDynamic(const std::string& arg_name, int32_t index) const override { + return helper_->InputIsDynamic(call_ctx_, arg_name, index); + } + bool* OutputIsDynamic(const std::string& arg_name, int32_t index) override { + return helper_->OutputIsDynamic(call_ctx_, arg_name, index); + } + bool* IsDynamic4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->IsDynamic4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + const JobDesc* job_desc() const override { return helper_->job_desc(); } + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } + const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { + return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + int64_t parallel_num() const override { return helper_->parallel_num(call_ctx_); } + + const std::string& input(const std::string& arg_name, int32_t index) const override { + return helper_->input(arg_name, index); + } + const std::string& output(const std::string& arg_name, int32_t index) const override { + return helper_->output(arg_name, index); + } + bool has_input(const std::string& arg_name, int32_t index) const override { + return helper_->has_input(arg_name, index); + } + bool has_output(const std::string& arg_name, int32_t index) const override { + return helper_->has_output(arg_name, index); + } + int32_t input_size(const std::string& arg_name) const override { + return helper_->input_size(arg_name); + } + int32_t output_size(const std::string& arg_name) const override { + return helper_->output_size(arg_name); + } + const std::string& op_name() const override { return helper_->op_name(); } + const std::string& op_type_name() const override { return helper_->op_type_name(); } + const std::string& op_loc() const override { return helper_->op_loc(); } + + private: + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const UserOpInferContextHelper* helper_; + eager::CallContext* call_ctx_; +}; + +class UserKernelComputeContextHelper final { + public: + UserKernelComputeContextHelper(DeviceType device_type, + const user_op::UserOpConfWrapper* user_op_conf, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : user_op_conf_(user_op_conf), + base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} + + ~UserKernelComputeContextHelper() = default; + + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, 
+ int32_t index) const { + return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + + user_op::Tensor* Tensor4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.Tensor4ArgNameAndIndex(call_ctx, arg_name, index); + } + ep::Stream* stream(DeviceCtx* device_ctx) const { + CHECK(device_ctx); + return device_ctx->stream(); + } + + DeviceType device_type() const { return base_ctx_helper_.device_type(); } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return base_ctx_helper_.parallel_ctx(call_ctx); + } + + const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + private: + const user_op::UserOpConfWrapper* user_op_conf_; + UserKernelBaseContextHelper base_ctx_helper_; +}; + +class UserKernelComputeContext final : public user_op::KernelComputeContext { + public: + UserKernelComputeContext(const UserKernelComputeContextHelper* helper, + eager::CallContext* call_ctx, DeviceCtx* device_ctx) + : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} + + ~UserKernelComputeContext() = default; + + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + user_op::Tensor* Tensor4ArgNameAndIndex(const std::string& arg_name, int32_t index) override { + return helper_->Tensor4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + ep::Stream* stream() override { return helper_->stream(device_ctx_); } + + DeviceType device_type() const override { return helper_->device_type(); } + + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + + private: + const user_op::UserOpConfWrapper& user_op_conf() const override { + return helper_->user_op_conf(); + } + + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const UserKernelComputeContextHelper* helper_; + eager::CallContext* call_ctx_; + DeviceCtx* device_ctx_; +}; + +class UserKernelRegContextHelper final { + public: + UserKernelRegContextHelper(DeviceType device_type, const user_op::UserOpConfWrapper* user_op_conf, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : user_op_conf_(user_op_conf), + base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} + ~UserKernelRegContextHelper() = default; + + DeviceType device_type() const { return base_ctx_helper_.device_type(); } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return base_ctx_helper_.parallel_ctx(call_ctx); + } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return 
base_ctx_helper_.outputs(); } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + + const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + private: + const user_op::UserOpConfWrapper* user_op_conf_; + UserKernelBaseContextHelper base_ctx_helper_; +}; + +class UserKernelRegContext final : public user_op::KernelRegContext { + public: + UserKernelRegContext(const UserKernelRegContextHelper* helper, eager::CallContext* call_ctx) + : helper_(helper), call_ctx_(call_ctx) {} + ~UserKernelRegContext() = default; + + DeviceType device_type() const override { return helper_->device_type(); } + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + + const user_op::UserOpConfWrapper& user_op_conf() const override { + return helper_->user_op_conf(); + } + + private: + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const UserKernelRegContextHelper* helper_; + eager::CallContext* call_ctx_; +}; + +class UserKernelInitAndCacheContextHelper final { + public: + UserKernelInitAndCacheContextHelper(DeviceType device_type, + const user_op::UserOpConfWrapper* user_op_conf, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) + : user_op_conf_(user_op_conf), + base_ctx_helper_(device_type, input_arg_tuple, output_arg_tuple) {} + + ~UserKernelInitAndCacheContextHelper() = default; + + ep::Stream* stream(DeviceCtx* device_ctx) const { + CHECK(device_ctx); + return device_ctx->stream(); + } + + DeviceType device_type() const { return base_ctx_helper_.device_type(); } + const ParallelContext& parallel_ctx(eager::CallContext* call_ctx) const { + return base_ctx_helper_.parallel_ctx(call_ctx); + } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.TensorDesc4ArgNameAndIndex(call_ctx, arg_name, index); + } + const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, + int32_t index) const { + return base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index); + } + const SbpParallel& SbpParallel4ArgNameAndIndex(eager::CallContext* call_ctx, + const std::string& arg_name, int32_t index) const { + const auto& nd_sbp = NdSbp4ArgNameAndIndex(call_ctx, arg_name, index); + CHECK_EQ(nd_sbp.sbp_parallel_size(), 1); + return nd_sbp.sbp_parallel(0); + } + + const NdSbp& NdSbp4ArgNameAndIndex(eager::CallContext* call_ctx, const std::string& arg_name, + int32_t index) const { + return *CHECK_NOTNULL( + base_ctx_helper_.ConsistentTensorMeta4ArgNameAndIndex(call_ctx, arg_name, index)) + ->nd_sbp(); + } + + const ArgVec& inputs() const { return base_ctx_helper_.inputs(); } + const ArgVec& outputs() const { return base_ctx_helper_.outputs(); } + const ParallelDesc& parallel_desc(eager::CallContext* call_ctx) const { + return *CHECK_JUST(base_ctx_helper_.parallel_desc(call_ctx)); + } + + 
const std::shared_ptr& Attr4Name(eager::CallContext* call_ctx, + const std::string& attr_name) const { + return call_ctx->composed_attrs().Attr4Name(attr_name); + } + + const user_op::UserOpConfWrapper& user_op_conf() const { return *user_op_conf_; } + + private: + const user_op::UserOpConfWrapper* user_op_conf_; + UserKernelBaseContextHelper base_ctx_helper_; +}; + +class UserKernelInitAndCacheContext final : public user_op::KernelInitContext, + public user_op::KernelCacheContext { + public: + UserKernelInitAndCacheContext(const UserKernelInitAndCacheContextHelper* helper, + eager::CallContext* call_ctx, DeviceCtx* device_ctx) + : helper_(helper), call_ctx_(call_ctx), device_ctx_(device_ctx) {} + + ~UserKernelInitAndCacheContext() override = default; + + ep::Stream* stream() override { return helper_->stream(device_ctx_); } + + DeviceType device_type() const override { return helper_->device_type(); } + const ParallelContext& parallel_ctx() const override { return helper_->parallel_ctx(call_ctx_); } + const user_op::TensorDesc* TensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->TensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const user_op::TensorDesc* LogicalTensorDesc4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->LogicalTensorDesc4ArgNameAndIndex(call_ctx_, arg_name, index); + } + const SbpParallel& SbpParallel4ArgNameAndIndex(const std::string& arg_name, + int32_t index) const override { + return helper_->SbpParallel4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const NdSbp& NdSbp4ArgNameAndIndex(const std::string& arg_name, int32_t index) const override { + return helper_->NdSbp4ArgNameAndIndex(call_ctx_, arg_name, index); + } + + const ArgVec& inputs() const override { return helper_->inputs(); } + const ArgVec& outputs() const override { return helper_->outputs(); } + const ParallelDesc& parallel_desc() const override { return helper_->parallel_desc(call_ctx_); } + + private: + const std::shared_ptr& Attr4Name( + const std::string& attr_name) const override { + return helper_->Attr4Name(call_ctx_, attr_name); + } + + const user_op::UserOpConfWrapper& user_op_conf() const override { + return helper_->user_op_conf(); + } + + const UserKernelInitAndCacheContextHelper* helper_; + eager::CallContext* call_ctx_; + DeviceCtx* device_ctx_; +}; + +namespace { + +Maybe InitTensorTupleIndexes4Bns(const std::shared_ptr& op_conf, + const ArgVec& indexed_input_pairs, + const ArgVec& indexed_output_pairs, + std::vector* input_tuple_indexes4const_ibns, + std::vector* input_tuple_indexes4mut_ibns, + std::vector* output_tuple_indexes4mut_obns, + std::vector* output_tuple_indexes4mut2_obns) { + const auto* op_reg_val = + user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(op_conf->user_conf().op_type_name()); + CHECK_NOTNULL_OR_RETURN(op_reg_val); + + ArgModifierSignature arg_modifier_signature; + for (const auto& pair : indexed_input_pairs) { + const std::string ibn = GenRepeatedBn(pair.first, pair.second); + arg_modifier_signature.mutable_ibn2input_blob_modifier()->insert( + {ibn, user_op::InputArgModifier()}); + } + for (const auto& pair : indexed_output_pairs) { + const std::string obn = GenRepeatedBn(pair.first, pair.second); + arg_modifier_signature.mutable_obn2output_blob_modifier()->insert( + {obn, user_op::OutputArgModifier()}); + } + user_op::UserOpConfWrapper op_conf_wrapper(op_conf); + if (op_reg_val->input_arg_modify_fn) { + user_op::GetInputArgModifier 
GetInputArgModifierFn = + [&arg_modifier_signature](const std::string& in_arg_name, + int32_t in_arg_index) -> user_op::InputArgModifier* { + const std::string ibn = GenRepeatedBn(in_arg_name, in_arg_index); + auto* map = arg_modifier_signature.mutable_ibn2input_blob_modifier(); + return &map->at(ibn); + }; + JUST(op_reg_val->input_arg_modify_fn(GetInputArgModifierFn, op_conf_wrapper)); + } + if (op_reg_val->output_arg_modify_fn) { + user_op::GetOutputArgModifier GetOutputArgModifierFn = + [&arg_modifier_signature](const std::string& in_arg_name, + int32_t in_arg_index) -> user_op::OutputArgModifier* { + const std::string obn = GenRepeatedBn(in_arg_name, in_arg_index); + auto* map = arg_modifier_signature.mutable_obn2output_blob_modifier(); + return &map->at(obn); + }; + JUST(op_reg_val->output_arg_modify_fn(GetOutputArgModifierFn, op_conf_wrapper)); + } + + for (int i = 0; i < indexed_input_pairs.size(); i++) { + const auto& pair = indexed_input_pairs.at(i); + const std::string ibn = GenRepeatedBn(pair.first, pair.second); + if (arg_modifier_signature.ibn2input_blob_modifier().at(ibn).is_mutable()) { + input_tuple_indexes4mut_ibns->emplace_back(i); + } else { + input_tuple_indexes4const_ibns->emplace_back(i); + } + } + + for (int i = 0; i < indexed_output_pairs.size(); i++) { + const auto& pair = indexed_output_pairs.at(i); + const std::string obn = GenRepeatedBn(pair.first, pair.second); + if (arg_modifier_signature.obn2output_blob_modifier().at(obn).header_infered_before_compute()) { + output_tuple_indexes4mut_obns->emplace_back(i); + } else { + output_tuple_indexes4mut2_obns->emplace_back(i); + } + } + return Maybe::Ok(); +} + +} // namespace + +/* static */ Maybe StatefulOpKernel::New( + const std::shared_ptr& op_conf, const Symbol& stream, + const AttrMap& base_attrs, const std::shared_ptr& parallel_desc, + const std::shared_ptr& input_arg_tuple, + const std::shared_ptr& output_arg_tuple) { + auto opkernel = std::shared_ptr(new StatefulOpKernel()); + opkernel->base_attrs_ = base_attrs; + opkernel->op_conf_ = op_conf; + opkernel->user_op_conf_.reset(new user_op::UserOpConfWrapper(op_conf)); + opkernel->stream_ = stream; + opkernel->input_arg_tuple_ = input_arg_tuple; + opkernel->output_arg_tuple_ = output_arg_tuple; + opkernel->need_check_mem_case_ = true; + + const DeviceType device_type = CHECK_JUST(DeviceType4DeviceTag(op_conf->device_tag())); + const user_op::UserOpConfWrapper* user_op_conf = opkernel->user_op_conf_.get(); + opkernel->op_infer_ctx_helper_.reset( + new UserOpInferContextHelper(user_op_conf, input_arg_tuple, output_arg_tuple)); + + opkernel->init_and_cache_ctx_helper_.reset(new UserKernelInitAndCacheContextHelper( + device_type, opkernel->user_op_conf_.get(), opkernel->input_arg_tuple_, + opkernel->output_arg_tuple_)); + opkernel->compute_ctx_helper_.reset(new UserKernelComputeContextHelper( + device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); + opkernel->reg_ctx_helper_.reset( + new UserKernelRegContextHelper(device_type, user_op_conf, input_arg_tuple, output_arg_tuple)); + const auto* op_reg_val = + user_op::UserOpRegistryMgr::Get().GetOpRegistryResult(user_op_conf->op_type_name()); + CHECK_NOTNULL_OR_RETURN(op_reg_val); + if (op_reg_val->logical_tensor_desc_infer_fn) { + opkernel->tensor_desc_infer_fn_ = op_reg_val->logical_tensor_desc_infer_fn; + } else { + return Error::UnimplementedError(); + } + opkernel->data_type_infer_fn_ = op_reg_val->data_type_infer_fn; + + JUST(InitTensorTupleIndexes4Bns( + op_conf, 
input_arg_tuple->indexed_arg_name_and_index(), + output_arg_tuple->indexed_arg_name_and_index(), &opkernel->input_tuple_indexes4const_ibns_, + &opkernel->input_tuple_indexes4mut_ibns_, &opkernel->output_tuple_indexes4mut_obns_, + &opkernel->output_tuple_indexes4mut2_obns_)); + + return opkernel; +} + +StatefulOpKernel::~StatefulOpKernel() = default; + +size_t StatefulOpKernel::InferTmpSize(eager::CallContext* call_ctx, + const user_op::OpKernel* user_opkernel) const { + UserOpInferContext op_infer_ctx(op_infer_ctx_helper_.get(), call_ctx); + const auto& InferTmpSizeFn = GetInferTmpSizeFn(user_opkernel); + return InferTmpSizeFn(&op_infer_ctx); +} + +Maybe StatefulOpKernel::ChooseOpKernel(eager::CallContext* call_ctx, + const user_op::OpKernel** user_opkernel, + bool* need_temp_storage) { + OF_PROFILER_RANGE_GUARD("ChooseOpKernel"); + DataType primary_dtype = kInvalidDataType; + const auto& inputs = call_ctx->inputs(); + const auto& outputs = call_ctx->outputs(); + if (likely(!inputs->empty())) { + primary_dtype = (*inputs)[0]->data_type(); + } else if (likely(!outputs->empty())) { + primary_dtype = (*outputs)[0]->data_type(); + } else { + // do nothing + } + + UserKernelRegContext reg_ctx(reg_ctx_helper_.get(), call_ctx); + for (const auto& pair : dtype2cached_kernels_[primary_dtype]) { + if (likely(pair.first->is_matched_hob->get(reg_ctx))) { + *need_temp_storage = pair.first->need_temp_storage; + *user_opkernel = pair.second.get(); + return Maybe::Ok(); + } + } + + OF_PROFILER_RANGE_GUARD("fallback"); + + const auto& op_type_name = user_op_conf_->op_type_name(); + const auto* kernel_reg_val = + JUST(user_op::UserOpRegistryMgr::Get().GetOpKernelRegistryResult(op_type_name, reg_ctx)); + CHECK_NOTNULL(kernel_reg_val); + auto* kernel = kernel_reg_val->create_fn(); + dtype2cached_kernels_[primary_dtype].push_back( + {kernel_reg_val, std::shared_ptr(kernel)}); + + infer_tmp_size_fn_map_.emplace(kernel, &kernel_reg_val->infer_tmp_size_fn); + *need_temp_storage = kernel_reg_val->need_temp_storage; + *user_opkernel = kernel; + return Maybe::Ok(); +} + +void StatefulOpKernel::TryInitOpKernelStateAndCache(eager::CallContext* call_ctx, + DeviceCtx* device_ctx, + const user_op::OpKernel* op_kernel, + user_op::OpKernelState** state, + user_op::OpKernelCache** cache) { + UserKernelInitAndCacheContext init_and_cache_ctx(init_and_cache_ctx_helper_.get(), call_ctx, + device_ctx); + if (state != nullptr) { + auto it = op_kernel_state_map_.find(op_kernel); + if (it != op_kernel_state_map_.end()) { + *state = it->second.get(); + } else { + auto created_state = op_kernel->CreateOpKernelState(&init_and_cache_ctx); + op_kernel_state_map_.emplace(op_kernel, created_state); + *state = created_state.get(); + } + } + + { + auto& cache_in_map = op_kernel_cache_map_[op_kernel]; + op_kernel->InitOpKernelCacheWithFlags(&init_and_cache_ctx, + user_op::OpKernelCache::kAllMayChanged, &cache_in_map); + *cache = cache_in_map.get(); + } +} + +const user_op::InferTmpSizeFn& StatefulOpKernel::GetInferTmpSizeFn( + const user_op::OpKernel* op_kernel) const { + return *infer_tmp_size_fn_map_.at(op_kernel); +} + +user_op::TensorDescInferFn StatefulOpKernel::TensorDescInferFn() const { + return tensor_desc_infer_fn_; +} + +user_op::DataTypeInferFn StatefulOpKernel::DataTypeInferFn() const { return data_type_infer_fn_; } + +void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_ctx, + const user_op::OpKernel* user_opkernel, + user_op::OpKernelState* state, + const user_op::OpKernelCache* cache) const { + 
UserKernelComputeContext compute_context(compute_ctx_helper_.get(), call_ctx, device_ctx);
+  auto* compute_ctx = &compute_context;
+  OF_PROFILER_RANGE_GUARD("Compute");
+  if (Singleton<profiler::ProfileManager>::Get()) {
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
+    const auto CalMemorySize = [compute_ctx](const one::ArgVec& args) -> int64_t {
+      const auto Func = [compute_ctx](int64_t mem_size, const auto& pair) {
+        const auto tensor = compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second);
+        return mem_size + tensor->shape_view().elem_cnt() * GetSizeOfDataType(tensor->data_type());
+      };
+      return std::accumulate(args.begin(), args.end(), static_cast<int64_t>(0), Func);
+    };
+#endif
+    auto er_guard = CHECK_JUST(profiler::EventRecorder::CreateKernelEventRecorder(
+        op_type_name(),
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
+        [compute_ctx, CalMemorySize]() -> int64_t {
+          return CalMemorySize(compute_ctx->inputs()) + CalMemorySize(compute_ctx->outputs());
+        },
+#endif
+        [compute_ctx]() -> std::vector<Shape> {
+          std::vector<Shape> shapes;
+          for (const auto& pair : compute_ctx->inputs()) {
+            shapes.emplace_back(
+                compute_ctx->TensorDesc4ArgNameAndIndex(pair.first, pair.second)->shape());
+          }
+          return shapes;
+        }));
+    user_opkernel->Compute(compute_ctx, state, cache);
+  } else {
+    user_opkernel->Compute(compute_ctx, state, cache);
+  }
+}
+
+}  // namespace one
+}  // namespace oneflow
diff --git a/oneflow/user/kernels/tf_prelu_kernel.hip.cpp b/oneflow/user/kernels/tf_prelu_kernel.hip.cpp
index 2a27ca3..572127b 100644
--- a/oneflow/user/kernels/tf_prelu_kernel.hip.cpp
+++ b/oneflow/user/kernels/tf_prelu_kernel.hip.cpp
@@ -1,254 +1,254 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-#include "hip/hip_runtime.h"
-#include "oneflow/core/framework/framework.h"
-#include "oneflow/core/ndarray/ndarray_util.h"
-#include "oneflow/core/ep/rocm/cuda_stream.h"
-
-namespace oneflow {
-
-namespace {
-
-template<typename T>
-__global__ void BroadcastPReluForwardGpu(const int32_t elem_cnt, const int32_t alpha_size,
-                                         const int32_t inner_size, const T* x, const T* alpha,
-                                         T* y) {
-  T zero_val = static_cast<T>(0.0);
-  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
-    const T x_i = x[i];
-    const T alpha_i = alpha[(i / inner_size) % alpha_size];
-    y[i] = x_i > zero_val ?
x_i : x_i * alpha_i; - } -} - -template -__global__ void BroadcastPReluBackwardGpu(const int32_t elem_cnt, const int32_t alpha_size, - const int32_t inner_size, const T* x, const T* alpha, - const T* dy, T* dx, T* alpha_diff) { - T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T dy_i = dy[i]; - const T alpha_i = alpha[(i / inner_size) % alpha_size]; - T dx_i = zero_val; - T alpha_diff_i = zero_val; - if (x_i > zero_val) { - dx_i = dy_i; - alpha_diff_i = zero_val; - } else { - dx_i = dy_i * alpha_i; - alpha_diff_i = dy_i * x_i; - } - dx[i] = dx_i; - alpha_diff[i] = alpha_diff_i; - } -} - -template -__global__ void ElemwisePReluForwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, T* y) { - T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T alpha_i = alpha[i]; - y[i] = x_i > zero_val ? x_i : x_i * alpha_i; - } -} - -template -__global__ void ElemwisePReluBackwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, - const T* dy, T* dx, T* alpha_diff) { - T zero_val = static_cast(0.0); - CUDA_1D_KERNEL_LOOP(i, elem_cnt) { - const T x_i = x[i]; - const T dy_i = dy[i]; - const T alpha_i = alpha[i]; - T dx_i = zero_val; - T alpha_diff_i = zero_val; - if (x_i > zero_val) { - dx_i = dy_i; - alpha_diff_i = zero_val; - } else { - dx_i = dy_i * alpha_i; - alpha_diff_i = dy_i * x_i; - } - dx[i] = dx_i; - alpha_diff[i] = alpha_diff_i; - } -} - -bool IsAlphaShapeContiguous(const ShapeView& alpha_shape, const ShapeView& x_shape) { - if (alpha_shape.elem_cnt() == 1) { return true; } - int64_t begin_idx = -1; - for (int64_t i = 0; i < alpha_shape.NumAxes(); ++i) { - if (alpha_shape.At(i) != 1) { - begin_idx = i; - break; - } - } - CHECK_NE(begin_idx, -1); - int64_t end_idx = -1; - for (int64_t i = alpha_shape.NumAxes(); i > 0; --i) { - if (alpha_shape.At(i - 1) != 1) { - end_idx = i; - break; - } - } - CHECK_NE(end_idx, -1); - if (alpha_shape.elem_cnt() == x_shape.Count(begin_idx + 1, end_idx + 1)) { - return true; - } else { - return false; - } -} - -int32_t GetOuterSize(const ShapeView& alpha_shape, const ShapeView& x_shape) { - int32_t outer_size = x_shape.At(0); - for (int32_t i = 0; i < alpha_shape.NumAxes(); ++i) { - if (alpha_shape.At(i) == 1) { - outer_size *= x_shape.At(i + 1); - } else { - break; - } - } - return outer_size; -} - -} // namespace - -template -class TfGpuPReluKernel final : public user_op::OpKernel { - public: - TfGpuPReluKernel() = default; - ~TfGpuPReluKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { - const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / outer_size / alpha_size; - BroadcastPReluForwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), y->mut_dptr()); - } else { - user_op::Tensor* broadcasted_alpha = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); - 
NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha->mut_dptr()), - XpuVarNdarray(left_extended_shape, alpha->dptr())); - ElemwisePReluForwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, x->dptr(), broadcasted_alpha->dptr(), y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_TF_CUDA_PRELU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("tf_prelu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("x", 0); \ - const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ - const int64_t tmp_buffer_size = \ - IsAlphaShapeContiguous(alpha_shape, in_shape) \ - ? 0 \ - : GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -REGISTER_TF_CUDA_PRELU_KERNEL(half) -REGISTER_TF_CUDA_PRELU_KERNEL(float) -REGISTER_TF_CUDA_PRELU_KERNEL(double) - -template -class TfGpuPReluGradKernel final : public user_op::OpKernel { - public: - TfGpuPReluGradKernel() = default; - ~TfGpuPReluGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); - const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); - const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); - user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); - user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); - user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); - const int32_t elem_cnt = x->shape_view().elem_cnt(); - T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); - T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() - + GetCudaAlignedSize(elem_cnt * sizeof(T))); - const Shape& left_extended_shape = - CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); - if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { - const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); - const int32_t alpha_size = alpha->shape_view().elem_cnt(); - const int32_t inner_size = elem_cnt / outer_size / alpha_size; - BroadcastPReluBackwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), dy->dptr(), - dx->mut_dptr(), broadcasted_alpha_diff); - } else { - T* broadcasted_alpha = reinterpret_cast(tmp_buffer->mut_dptr() - + 2 * GetCudaAlignedSize(elem_cnt * sizeof(T))); - - NdarrayUtil::BroadcastTo( - ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha), - XpuVarNdarray(left_extended_shape, alpha->dptr())); - - ElemwisePReluBackwardGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, x->dptr(), broadcasted_alpha, dy->dptr(), dx->mut_dptr(), - broadcasted_alpha_diff); - } - NdarrayUtil::ReduceSum( - ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), - XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), - XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("tf_prelu_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) 
== GetDataType::value)) \ - .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ - const Shape& in_shape = ctx->InputShape("x", 0); \ - const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ - const int64_t tmp_buffer_size = \ - IsAlphaShapeContiguous(alpha_shape, in_shape) \ - ? 2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)) \ - : 3 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ - return tmp_buffer_size; \ - }); - -REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(half) -REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(float) -REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/ndarray/ndarray_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void BroadcastPReluForwardGpu(const int32_t elem_cnt, const int32_t alpha_size, + const int32_t inner_size, const T* x, const T* alpha, + T* y) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T alpha_i = alpha[(i / inner_size) % alpha_size]; + y[i] = x_i > zero_val ? x_i : x_i * alpha_i; + } +} + +template +__global__ void BroadcastPReluBackwardGpu(const int32_t elem_cnt, const int32_t alpha_size, + const int32_t inner_size, const T* x, const T* alpha, + const T* dy, T* dx, T* alpha_diff) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T dy_i = dy[i]; + const T alpha_i = alpha[(i / inner_size) % alpha_size]; + T dx_i = zero_val; + T alpha_diff_i = zero_val; + if (x_i > zero_val) { + dx_i = dy_i; + alpha_diff_i = zero_val; + } else { + dx_i = dy_i * alpha_i; + alpha_diff_i = dy_i * x_i; + } + dx[i] = dx_i; + alpha_diff[i] = alpha_diff_i; + } +} + +template +__global__ void ElemwisePReluForwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, T* y) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T alpha_i = alpha[i]; + y[i] = x_i > zero_val ? 
x_i : x_i * alpha_i; + } +} + +template +__global__ void ElemwisePReluBackwardGpu(const int32_t elem_cnt, const T* x, const T* alpha, + const T* dy, T* dx, T* alpha_diff) { + T zero_val = static_cast(0.0); + CUDA_1D_KERNEL_LOOP(i, elem_cnt) { + const T x_i = x[i]; + const T dy_i = dy[i]; + const T alpha_i = alpha[i]; + T dx_i = zero_val; + T alpha_diff_i = zero_val; + if (x_i > zero_val) { + dx_i = dy_i; + alpha_diff_i = zero_val; + } else { + dx_i = dy_i * alpha_i; + alpha_diff_i = dy_i * x_i; + } + dx[i] = dx_i; + alpha_diff[i] = alpha_diff_i; + } +} + +bool IsAlphaShapeContiguous(const ShapeView& alpha_shape, const ShapeView& x_shape) { + if (alpha_shape.elem_cnt() == 1) { return true; } + int64_t begin_idx = -1; + for (int64_t i = 0; i < alpha_shape.NumAxes(); ++i) { + if (alpha_shape.At(i) != 1) { + begin_idx = i; + break; + } + } + CHECK_NE(begin_idx, -1); + int64_t end_idx = -1; + for (int64_t i = alpha_shape.NumAxes(); i > 0; --i) { + if (alpha_shape.At(i - 1) != 1) { + end_idx = i; + break; + } + } + CHECK_NE(end_idx, -1); + if (alpha_shape.elem_cnt() == x_shape.Count(begin_idx + 1, end_idx + 1)) { + return true; + } else { + return false; + } +} + +int32_t GetOuterSize(const ShapeView& alpha_shape, const ShapeView& x_shape) { + int32_t outer_size = x_shape.At(0); + for (int32_t i = 0; i < alpha_shape.NumAxes(); ++i) { + if (alpha_shape.At(i) == 1) { + outer_size *= x_shape.At(i + 1); + } else { + break; + } + } + return outer_size; +} + +} // namespace + +template +class TfGpuPReluKernel final : public user_op::OpKernel { + public: + TfGpuPReluKernel() = default; + ~TfGpuPReluKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { + const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / outer_size / alpha_size; + BroadcastPReluForwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), y->mut_dptr()); + } else { + user_op::Tensor* broadcasted_alpha = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const Shape& left_extended_shape = + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); + NdarrayUtil::BroadcastTo( + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha->mut_dptr()), + XpuVarNdarray(left_extended_shape, alpha->dptr())); + ElemwisePReluForwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, x->dptr(), broadcasted_alpha->dptr(), y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_TF_CUDA_PRELU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("tf_prelu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("x", 0); \ + const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ + const int64_t tmp_buffer_size = \ + IsAlphaShapeContiguous(alpha_shape, in_shape) \ + ? 
0 \ + : GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +REGISTER_TF_CUDA_PRELU_KERNEL(half) +REGISTER_TF_CUDA_PRELU_KERNEL(float) +REGISTER_TF_CUDA_PRELU_KERNEL(double) + +template +class TfGpuPReluGradKernel final : public user_op::OpKernel { + public: + TfGpuPReluGradKernel() = default; + ~TfGpuPReluGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0); + const user_op::Tensor* alpha = ctx->Tensor4ArgNameAndIndex("alpha", 0); + const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0); + user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0); + user_op::Tensor* alpha_diff = ctx->Tensor4ArgNameAndIndex("alpha_diff", 0); + user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); + const int32_t elem_cnt = x->shape_view().elem_cnt(); + T* broadcasted_alpha_diff = tmp_buffer->mut_dptr(); + T* reduce_sum_tmp_buf = reinterpret_cast(tmp_buffer->mut_dptr() + + GetCudaAlignedSize(elem_cnt * sizeof(T))); + const Shape& left_extended_shape = + CreateLeftExtendedShape(ShapeView(alpha->shape_view()), x->shape_view().NumAxes()); + if (IsAlphaShapeContiguous(alpha->shape_view(), x->shape_view())) { + const int32_t outer_size = GetOuterSize(alpha->shape_view(), x->shape_view()); + const int32_t alpha_size = alpha->shape_view().elem_cnt(); + const int32_t inner_size = elem_cnt / outer_size / alpha_size; + BroadcastPReluBackwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, alpha_size, inner_size, x->dptr(), alpha->dptr(), dy->dptr(), + dx->mut_dptr(), broadcasted_alpha_diff); + } else { + T* broadcasted_alpha = reinterpret_cast(tmp_buffer->mut_dptr() + + 2 * GetCudaAlignedSize(elem_cnt * sizeof(T))); + + NdarrayUtil::BroadcastTo( + ctx->stream(), XpuVarNdarray(x->shape_view(), broadcasted_alpha), + XpuVarNdarray(left_extended_shape, alpha->dptr())); + + ElemwisePReluBackwardGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, x->dptr(), broadcasted_alpha, dy->dptr(), dx->mut_dptr(), + broadcasted_alpha_diff); + } + NdarrayUtil::ReduceSum( + ctx->stream(), XpuVarNdarray(left_extended_shape, alpha_diff->mut_dptr()), + XpuVarNdarray(x->shape_view(), broadcasted_alpha_diff), + XpuVarNdarray(x->shape_view(), reduce_sum_tmp_buf)); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("tf_prelu_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)) \ + .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \ + const Shape& in_shape = ctx->InputShape("x", 0); \ + const Shape& alpha_shape = ctx->InputShape("alpha", 0); \ + const int64_t tmp_buffer_size = \ + IsAlphaShapeContiguous(alpha_shape, in_shape) \ + ? 
2 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)) \ + : 3 * GetCudaAlignedSize(in_shape.elem_cnt() * sizeof(dtype)); \ + return tmp_buffer_size; \ + }); + +REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(half) +REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(float) +REGISTER_TF_CUDA_PRELU_GRAD_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/to_contiguous_kernel.hip.cpp b/oneflow/user/kernels/to_contiguous_kernel.hip.cpp index 72d23a8..ac2791c 100644 --- a/oneflow/user/kernels/to_contiguous_kernel.hip.cpp +++ b/oneflow/user/kernels/to_contiguous_kernel.hip.cpp @@ -1,161 +1,161 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/core/common/device_type.pb.h" -#include "oneflow/user/kernels/to_contiguous_kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include "oneflow/core/hip/elementwise.hip.h" - -namespace oneflow { - -namespace { - -constexpr int32_t kThreadWorkSize = 4; -constexpr int32_t kNumThreads = 32 * 4; -constexpr int32_t get_min_threads_num() { return kNumThreads; } -constexpr int32_t get_block_work_size() { return kThreadWorkSize * kNumThreads; } -constexpr int32_t get_num_blocks(int64_t elem_cnt) { - return (elem_cnt + get_block_work_size() - 1) / get_block_work_size(); -} - -struct StrideParam { - int32_t stride[SHAPE_MAX_AXIS_SIZE]; - - StrideParam(const int64_t* stride_vec, const size_t ndim) { - for (size_t i = 0; i < ndim; ++i) { stride[i] = stride_vec[i]; } - } -}; - -template -__device__ __forceinline__ IndexType compute_index(IndexType out_offset, - const StrideParam& out_params, - const StrideParam& in_params) { - IndexType in_offset = 0; - IndexType remaining = out_offset; - -#pragma unroll - for (size_t i = 0; i < ndim; ++i) { - const IndexType idx = static_cast(remaining / out_params.stride[i]); - remaining -= idx * out_params.stride[i]; - in_offset += idx * in_params.stride[i]; - } - return in_offset; -} - -template -__global__ void ToContiguousForwardGpuParallel(IndexType count, const StrideParam in_stride, - const StrideParam out_stride, const T* in_dptr, - T* out_dptr, const int32_t num_block_threads, - const int32_t thread_work_size, - const int32_t block_work_size) { - IndexType remaining = count - block_work_size * blockIdx.x; - IndexType idx = blockIdx.x; - IndexType thread_idx = threadIdx.x; -#pragma unroll - for (int32_t i = 0; i < thread_work_size; i++) { - if (thread_idx >= remaining) { return; } - IndexType out_idx = thread_idx + block_work_size * idx; - IndexType in_idx = compute_index(out_idx, out_stride, in_stride); - out_dptr[out_idx] = in_dptr[in_idx]; - thread_idx += num_block_threads; - } -} - -template -void LaunchToContiguousKernel(ep::Stream* stream, IndexType count, const size_t ndim, - IndexType block_size, const std::vector& in_stride, - const DimVector& out_stride, const char* in_dptr, char* out_dptr) { - const int32_t num_blocks = get_num_blocks(count); - constexpr int32_t num_threads = 
get_min_threads_num(); - constexpr int32_t block_work_size = get_block_work_size(); - StrideParam param_in_stride(in_stride.data(), ndim), param_out_stride(out_stride.data(), ndim); - - switch (ndim) { -#define TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(dim) \ - case dim: \ - ToContiguousForwardGpuParallel \ - <<As()->cuda_stream()>>>( \ - count, param_in_stride, param_out_stride, reinterpret_cast(in_dptr), \ - reinterpret_cast(out_dptr), num_threads, kThreadWorkSize, block_work_size); \ - break; - - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(1) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(2) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(3) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(4) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(5) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(6) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(7) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(8) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(9) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(10) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(11) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(12) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(13) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(14) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(15) - TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(16) - default: break; -#undef TO_CONTIGUOUS_FORWARD_GPU_PARALLEL - } -} - -} // namespace - -template -struct ToContiguousUtil : ToContiguousUtilBase { - using ToContiguousUtilBase::ToContiguousUtilBase; - static constexpr size_t dsize = sizeof(T); - void operator()() { - int constant_memory_size = 0; - const size_t ndims = contiguous_dim + 1; - if (ndims == 0) { - // 0-dim tensor - OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, block_size * dsize, hipMemcpyDeviceToDevice, - stream->As()->cuda_stream())); - } else { - bool is_same = true; - for (int64_t i = contiguous_dim; i != -1; --i) { - if (out_stride[i] != in_stride[i]) { - is_same = false; - break; - } - } - if (is_same) { - // if input tensor's strides equals to output's, than just copy one memory-contiguous tensor - OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, element_count * dsize, - hipMemcpyDeviceToDevice, - stream->As()->cuda_stream())); - } else { - if (element_count < GetMaxVal()) { - LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, - out_stride, in_dptr, out_dptr); - } else { - LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, - out_stride, in_dptr, out_dptr); - } - } - } - } -}; - -#define INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA(T) \ - template struct ToContiguousUtil; -OF_PP_FOR_EACH_TUPLE(INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA, - TO_CONTIGUOUS_TYPES TO_CONTIGUOUS_CUDA_SPECIAL_TYPE) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/core/common/device_type.pb.h" +#include "oneflow/user/kernels/to_contiguous_kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include "oneflow/core/hip/elementwise.hip.h" + +namespace oneflow { + +namespace { + +constexpr int32_t kThreadWorkSize = 4; +constexpr int32_t kNumThreads = 32 * 4; +constexpr int32_t get_min_threads_num() { return kNumThreads; } +constexpr int32_t get_block_work_size() { return kThreadWorkSize * kNumThreads; } +constexpr int32_t get_num_blocks(int64_t elem_cnt) { + return (elem_cnt + get_block_work_size() - 1) / get_block_work_size(); +} + +struct StrideParam { + int32_t stride[SHAPE_MAX_AXIS_SIZE]; + + StrideParam(const int64_t* stride_vec, const size_t ndim) { + for (size_t i = 0; i < ndim; ++i) { stride[i] = stride_vec[i]; } + } +}; + +template +__device__ __forceinline__ IndexType compute_index(IndexType out_offset, + const StrideParam& out_params, + const StrideParam& in_params) { + IndexType in_offset = 0; + IndexType remaining = out_offset; + +#pragma unroll + for (size_t i = 0; i < ndim; ++i) { + const IndexType idx = static_cast(remaining / out_params.stride[i]); + remaining -= idx * out_params.stride[i]; + in_offset += idx * in_params.stride[i]; + } + return in_offset; +} + +template +__global__ void ToContiguousForwardGpuParallel(IndexType count, const StrideParam in_stride, + const StrideParam out_stride, const T* in_dptr, + T* out_dptr, const int32_t num_block_threads, + const int32_t thread_work_size, + const int32_t block_work_size) { + IndexType remaining = count - block_work_size * blockIdx.x; + IndexType idx = blockIdx.x; + IndexType thread_idx = threadIdx.x; +#pragma unroll + for (int32_t i = 0; i < thread_work_size; i++) { + if (thread_idx >= remaining) { return; } + IndexType out_idx = thread_idx + block_work_size * idx; + IndexType in_idx = compute_index(out_idx, out_stride, in_stride); + out_dptr[out_idx] = in_dptr[in_idx]; + thread_idx += num_block_threads; + } +} + +template +void LaunchToContiguousKernel(ep::Stream* stream, IndexType count, const size_t ndim, + IndexType block_size, const std::vector& in_stride, + const DimVector& out_stride, const char* in_dptr, char* out_dptr) { + const int32_t num_blocks = get_num_blocks(count); + constexpr int32_t num_threads = get_min_threads_num(); + constexpr int32_t block_work_size = get_block_work_size(); + StrideParam param_in_stride(in_stride.data(), ndim), param_out_stride(out_stride.data(), ndim); + + switch (ndim) { +#define TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(dim) \ + case dim: \ + ToContiguousForwardGpuParallel \ + <<As()->cuda_stream()>>>( \ + count, param_in_stride, param_out_stride, reinterpret_cast(in_dptr), \ + reinterpret_cast(out_dptr), num_threads, kThreadWorkSize, block_work_size); \ + break; + + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(1) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(2) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(3) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(4) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(5) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(6) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(7) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(8) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(9) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(10) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(11) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(12) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(13) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(14) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(15) + TO_CONTIGUOUS_FORWARD_GPU_PARALLEL(16) + default: break; +#undef TO_CONTIGUOUS_FORWARD_GPU_PARALLEL + } +} + +} // 
namespace + +template +struct ToContiguousUtil : ToContiguousUtilBase { + using ToContiguousUtilBase::ToContiguousUtilBase; + static constexpr size_t dsize = sizeof(T); + void operator()() { + int constant_memory_size = 0; + const size_t ndims = contiguous_dim + 1; + if (ndims == 0) { + // 0-dim tensor + OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, block_size * dsize, hipMemcpyDeviceToDevice, + stream->As()->cuda_stream())); + } else { + bool is_same = true; + for (int64_t i = contiguous_dim; i != -1; --i) { + if (out_stride[i] != in_stride[i]) { + is_same = false; + break; + } + } + if (is_same) { + // if input tensor's strides equals to output's, than just copy one memory-contiguous tensor + OF_CUDA_CHECK(hipMemcpyAsync(out_dptr, in_dptr, element_count * dsize, + hipMemcpyDeviceToDevice, + stream->As()->cuda_stream())); + } else { + if (element_count < GetMaxVal()) { + LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, + out_stride, in_dptr, out_dptr); + } else { + LaunchToContiguousKernel(stream, element_count, ndims, block_size, in_stride, + out_stride, in_dptr, out_dptr); + } + } + } + } +}; + +#define INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA(T) \ + template struct ToContiguousUtil; +OF_PP_FOR_EACH_TUPLE(INSTANTIATE_TO_CONTIGUOUS_UTILS_FOR_CUDA, + TO_CONTIGUOUS_TYPES TO_CONTIGUOUS_CUDA_SPECIAL_TYPE) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/tril_kernel.hip.cpp b/oneflow/user/kernels/tril_kernel.hip.cpp index 817e8b0..02e7b3d 100644 --- a/oneflow/user/kernels/tril_kernel.hip.cpp +++ b/oneflow/user/kernels/tril_kernel.hip.cpp @@ -1,256 +1,256 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void TrilGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, - const int64_t diagonal, const T* x, const T fill, T* y) { - const int64_t matrix_size = num_rows * num_cols; - CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { - const int64_t offset_in_matrix = k % matrix_size; - const int64_t i = offset_in_matrix / num_cols; - const int64_t j = offset_in_matrix - num_cols * i; - y[k] = j > i + diagonal ? 
fill : x[k]; - } -} - -template -__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, const T* x, - const T fill, T* y) { - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { - const int64_t idx = i * num_cols + col; - y[idx] = col > row + diagonal ? fill : x[idx]; - } - } -} - -template<> -__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, - const half* x, const half fill, half* y) { - const int64_t h2_num_cols = num_cols / 2; - const auto* x_h2 = reinterpret_cast(x); - auto* y_h2 = reinterpret_cast(y); - - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { - const int64_t idx = i * h2_num_cols + col; - const half2 x_val = x_h2[idx]; - half2 y_val; - y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(x_val.data.x); - y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(x_val.data.y); - y_h2[idx] = y_val; - } - } -} - -template -__global__ void FusedScaleTrilGpu(const int64_t elem_cnt, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, const T scale, - const T* x, const T fill, T* y) { - const int64_t matrix_size = num_rows * num_cols; - CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { - const int64_t offset_in_matrix = k % matrix_size; - const int64_t i = offset_in_matrix / num_cols; - const int64_t j = offset_in_matrix - num_cols * i; - y[k] = j > i + diagonal ? fill : (scale * x[k]); - } -} - -template -__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, - const T scale, const T* x, const T fill, T* y) { - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { - const int64_t idx = i * num_cols + col; - y[idx] = col > row + diagonal ? 
fill : (scale * x[idx]); - } - } -} - -template<> -__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, - const int64_t num_rows, - const int64_t num_cols, - const int64_t diagonal, const half scale, - const half* x, const half fill, half* y) { - const int64_t h2_num_cols = num_cols / 2; - const auto* x_h2 = reinterpret_cast(x); - auto* y_h2 = reinterpret_cast(y); - const half2 h2_scale = __half2half2(scale); - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { - const int64_t idx = i * h2_num_cols + col; - const half2 scaled_x = __hmul2(h2_scale, x_h2[idx]); - half2 y_val; - y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(scaled_x.data.x); - y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(scaled_x.data.y); - y_h2[idx] = y_val; - } - } -} - -template -T GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { - return is_floating_val ? static_cast(floating_value) : static_cast(integer_value); -} - -template<> -half GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { - return is_floating_val ? __float2half(floating_value) : __float2half(integer_value); -} - -} // namespace - -template -class GpuTrilKernel final : public user_op::OpKernel { - public: - GpuTrilKernel() = default; - ~GpuTrilKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape_view(); - const auto diagonal = ctx->Attr("diagonal"); - const int64_t num_rows = shape.At(shape.NumAxes() - 2); - const int64_t num_cols = shape.At(shape.NumAxes() - 1); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t elem_cnt = shape.elem_cnt(); - const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), - ctx->Attr("floating_fill_value"), - ctx->Attr("integer_fill_value")); - if (num_cols % (kCudaWarpSize * 2) == 0) { - const int64_t total_rows = elem_cnt / num_cols; - TrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( - total_rows, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); - } else { - TrilGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_TRIL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("tril") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_TRIL_KERNEL(float) -REGISTER_CUDA_TRIL_KERNEL(double) -REGISTER_CUDA_TRIL_KERNEL(bool) -REGISTER_CUDA_TRIL_KERNEL(uint8_t) -REGISTER_CUDA_TRIL_KERNEL(int8_t) -REGISTER_CUDA_TRIL_KERNEL(int32_t) -REGISTER_CUDA_TRIL_KERNEL(int64_t) -REGISTER_CUDA_TRIL_KERNEL(half) - -template -class GpuFusedScaleTrilKernel final : public user_op::OpKernel { - 
public: - GpuFusedScaleTrilKernel() = default; - ~GpuFusedScaleTrilKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape_view(); - const auto diagonal = ctx->Attr("diagonal"); - const int32_t num_rows = shape.At(shape.NumAxes() - 2); - const int32_t num_cols = shape.At(shape.NumAxes() - 1); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t elem_cnt = shape.elem_cnt(); - const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), - ctx->Attr("floating_fill_value"), - ctx->Attr("integer_fill_value")); - const T scale = GetAttrVal(ctx->Attr("is_floating_scale_value"), - ctx->Attr("floating_scale_value"), - ctx->Attr("integer_scale_value")); - if (num_cols % (kCudaWarpSize * 2) == 0) { - const int64_t total_rows = elem_cnt / num_cols; - FusedScaleTrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( - total_rows, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); - } else { - FusedScaleTrilGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(dtype) \ - REGISTER_USER_KERNEL("fused_scale_tril") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(float) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(double) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(bool) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(uint8_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int8_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int32_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int64_t) -REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(half) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void TrilGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, + const int64_t diagonal, const T* x, const T fill, T* y) { + const int64_t matrix_size = num_rows * num_cols; + CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { + const int64_t offset_in_matrix = k % matrix_size; + const int64_t i = offset_in_matrix / num_cols; + const int64_t j = offset_in_matrix - num_cols * i; + y[k] = j > i + diagonal ? 
fill : x[k]; + } +} + +template +__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, const T* x, + const T fill, T* y) { + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { + const int64_t idx = i * num_cols + col; + y[idx] = col > row + diagonal ? fill : x[idx]; + } + } +} + +template<> +__global__ void TrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, + const half* x, const half fill, half* y) { + const int64_t h2_num_cols = num_cols / 2; + const auto* x_h2 = reinterpret_cast(x); + auto* y_h2 = reinterpret_cast(y); + + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { + const int64_t idx = i * h2_num_cols + col; + const half2 x_val = x_h2[idx]; + half2 y_val; + y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(x_val.data.x); + y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(x_val.data.y); + y_h2[idx] = y_val; + } + } +} + +template +__global__ void FusedScaleTrilGpu(const int64_t elem_cnt, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, const T scale, + const T* x, const T fill, T* y) { + const int64_t matrix_size = num_rows * num_cols; + CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { + const int64_t offset_in_matrix = k % matrix_size; + const int64_t i = offset_in_matrix / num_cols; + const int64_t j = offset_in_matrix - num_cols * i; + y[k] = j > i + diagonal ? fill : (scale * x[k]); + } +} + +template +__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, + const T scale, const T* x, const T fill, T* y) { + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { + const int64_t idx = i * num_cols + col; + y[idx] = col > row + diagonal ? 
fill : (scale * x[idx]); + } + } +} + +template<> +__global__ void FusedScaleTrilWarpProcessRowGpu(const int64_t total_rows, + const int64_t num_rows, + const int64_t num_cols, + const int64_t diagonal, const half scale, + const half* x, const half fill, half* y) { + const int64_t h2_num_cols = num_cols / 2; + const auto* x_h2 = reinterpret_cast(x); + auto* y_h2 = reinterpret_cast(y); + const half2 h2_scale = __half2half2(scale); + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { + const int64_t idx = i * h2_num_cols + col; + const half2 scaled_x = __hmul2(h2_scale, x_h2[idx]); + half2 y_val; + y_val.data.x = (2 * col) > row + diagonal ? fill : static_cast(scaled_x.data.x); + y_val.data.y = (2 * col + 1) > row + diagonal ? fill : static_cast(scaled_x.data.y); + y_h2[idx] = y_val; + } + } +} + +template +T GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { + return is_floating_val ? static_cast(floating_value) : static_cast(integer_value); +} + +template<> +half GetAttrVal(bool is_floating_val, double floating_value, int64_t integer_value) { + return is_floating_val ? __float2half(floating_value) : __float2half(integer_value); +} + +} // namespace + +template +class GpuTrilKernel final : public user_op::OpKernel { + public: + GpuTrilKernel() = default; + ~GpuTrilKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); + const auto shape = x->shape_view(); + const auto diagonal = ctx->Attr("diagonal"); + const int64_t num_rows = shape.At(shape.NumAxes() - 2); + const int64_t num_cols = shape.At(shape.NumAxes() - 1); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t elem_cnt = shape.elem_cnt(); + const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), + ctx->Attr("floating_fill_value"), + ctx->Attr("integer_fill_value")); + if (num_cols % (kCudaWarpSize * 2) == 0) { + const int64_t total_rows = elem_cnt / num_cols; + TrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( + total_rows, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); + } else { + TrilGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, num_rows, num_cols, diagonal, x->dptr(), fill, y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_TRIL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("tril") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ + return Maybe::Ok(); \ + }); + +REGISTER_CUDA_TRIL_KERNEL(float) +REGISTER_CUDA_TRIL_KERNEL(double) +REGISTER_CUDA_TRIL_KERNEL(bool) +REGISTER_CUDA_TRIL_KERNEL(uint8_t) +REGISTER_CUDA_TRIL_KERNEL(int8_t) +REGISTER_CUDA_TRIL_KERNEL(int32_t) +REGISTER_CUDA_TRIL_KERNEL(int64_t) +REGISTER_CUDA_TRIL_KERNEL(half) + +template +class GpuFusedScaleTrilKernel final : public user_op::OpKernel { + 
public: + GpuFusedScaleTrilKernel() = default; + ~GpuFusedScaleTrilKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); + const auto shape = x->shape_view(); + const auto diagonal = ctx->Attr("diagonal"); + const int32_t num_rows = shape.At(shape.NumAxes() - 2); + const int32_t num_cols = shape.At(shape.NumAxes() - 1); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t elem_cnt = shape.elem_cnt(); + const T fill = GetAttrVal(ctx->Attr("is_floating_fill_value"), + ctx->Attr("floating_fill_value"), + ctx->Attr("integer_fill_value")); + const T scale = GetAttrVal(ctx->Attr("is_floating_scale_value"), + ctx->Attr("floating_scale_value"), + ctx->Attr("integer_scale_value")); + if (num_cols % (kCudaWarpSize * 2) == 0) { + const int64_t total_rows = elem_cnt / num_cols; + FusedScaleTrilWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( + total_rows, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); + } else { + FusedScaleTrilGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, num_rows, num_cols, diagonal, scale, x->dptr(), fill, y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(dtype) \ + REGISTER_USER_KERNEL("fused_scale_tril") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ + return Maybe::Ok(); \ + }); + +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(float) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(double) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(bool) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(uint8_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int8_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int32_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(int64_t) +REGISTER_CUDA_FUSED_SCALE_TRIL_KERNEL(half) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/triu_kernel.hip.cpp b/oneflow/user/kernels/triu_kernel.hip.cpp index 6ffb20c..23e511c 100644 --- a/oneflow/user/kernels/triu_kernel.hip.cpp +++ b/oneflow/user/kernels/triu_kernel.hip.cpp @@ -1,131 +1,131 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/data_type.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/util/cuda_half_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void TriuGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, - const int64_t diagonal, const T* x, T* y) { - const int64_t matrix_size = num_rows * num_cols; - CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { - const int64_t offset_in_matrix = k % matrix_size; - const int64_t i = offset_in_matrix / num_cols; - const int64_t j = offset_in_matrix - num_cols * i; - y[k] = j < i + diagonal ? static_cast(0) : x[k]; - } -} - -template -__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, const T* x, - T* y) { - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { - const int64_t idx = i * num_cols + col; - y[idx] = col < row + diagonal ? static_cast(0) : x[idx]; - } - } -} - -template<> -__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, - const int64_t num_cols, const int64_t diagonal, - const half* x, half* y) { - const int64_t h2_num_cols = num_cols / 2; - const auto* x_h2 = reinterpret_cast(x); - auto* y_h2 = reinterpret_cast(y); - - const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; - const int64_t lan_id = threadIdx.x % kCudaWarpSize; - const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; - for (int64_t i = warp_id; i < total_rows; i += num_warp) { - const int64_t row = i % num_rows; - for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { - const int64_t idx = i * h2_num_cols + col; - const half2 x_val = x_h2[idx]; - half2 y_val; - y_val.data.x = (2 * col) < row + diagonal ? static_cast(0) : static_cast(x_val.data.x); - y_val.data.y = (2 * col + 1) < row + diagonal ? 
static_cast(0) : static_cast(x_val.data.y); - y_h2[idx] = y_val; - } - } -} - -} // namespace - -template -class GpuTriuKernel final : public user_op::OpKernel { - public: - GpuTriuKernel() = default; - ~GpuTriuKernel() override = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); - const auto shape = x->shape_view(); - const auto diagonal = ctx->Attr("diagonal"); - const int64_t num_rows = shape.At(shape.NumAxes() - 2); - const int64_t num_cols = shape.At(shape.NumAxes() - 1); - user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); - const int32_t elem_cnt = shape.elem_cnt(); - if (elem_cnt == 0) { return; } - if (num_cols % (kCudaWarpSize * 2) == 0) { - const int64_t total_rows = elem_cnt / num_cols; - TriuWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( - total_rows, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); - } else { - TriuGpu<<stream()->As()->cuda_stream()>>>( - elem_cnt, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_CUDA_TRIU_KERNEL(dtype) \ - REGISTER_USER_KERNEL("triu") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("out", 0) == GetDataType::value)) \ - .SetInplaceProposalFn([](const user_op::InferContext&, \ - user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ - OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ - return Maybe::Ok(); \ - }); - -REGISTER_CUDA_TRIU_KERNEL(half) -REGISTER_CUDA_TRIU_KERNEL(float) -REGISTER_CUDA_TRIU_KERNEL(double) -REGISTER_CUDA_TRIU_KERNEL(uint8_t) -REGISTER_CUDA_TRIU_KERNEL(int8_t) -REGISTER_CUDA_TRIU_KERNEL(int32_t) -REGISTER_CUDA_TRIU_KERNEL(int64_t) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/data_type.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/util/cuda_half_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void TriuGpu(const int64_t elem_cnt, const int64_t num_rows, const int64_t num_cols, + const int64_t diagonal, const T* x, T* y) { + const int64_t matrix_size = num_rows * num_cols; + CUDA_1D_KERNEL_LOOP_T(int64_t, k, elem_cnt) { + const int64_t offset_in_matrix = k % matrix_size; + const int64_t i = offset_in_matrix / num_cols; + const int64_t j = offset_in_matrix - num_cols * i; + y[k] = j < i + diagonal ? 
static_cast(0) : x[k]; + } +} + +template +__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, const T* x, + T* y) { + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < num_cols; col += kCudaWarpSize) { + const int64_t idx = i * num_cols + col; + y[idx] = col < row + diagonal ? static_cast(0) : x[idx]; + } + } +} + +template<> +__global__ void TriuWarpProcessRowGpu(const int64_t total_rows, const int64_t num_rows, + const int64_t num_cols, const int64_t diagonal, + const half* x, half* y) { + const int64_t h2_num_cols = num_cols / 2; + const auto* x_h2 = reinterpret_cast(x); + auto* y_h2 = reinterpret_cast(y); + + const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kCudaWarpSize; + const int64_t lan_id = threadIdx.x % kCudaWarpSize; + const int64_t num_warp = blockDim.x * gridDim.x / kCudaWarpSize; + for (int64_t i = warp_id; i < total_rows; i += num_warp) { + const int64_t row = i % num_rows; + for (int64_t col = lan_id; col < h2_num_cols; col += kCudaWarpSize) { + const int64_t idx = i * h2_num_cols + col; + const half2 x_val = x_h2[idx]; + half2 y_val; + y_val.data.x = (2 * col) < row + diagonal ? static_cast(0) : static_cast(x_val.data.x); + y_val.data.y = (2 * col + 1) < row + diagonal ? static_cast(0) : static_cast(x_val.data.y); + y_h2[idx] = y_val; + } + } +} + +} // namespace + +template +class GpuTriuKernel final : public user_op::OpKernel { + public: + GpuTriuKernel() = default; + ~GpuTriuKernel() override = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("in", 0); + const auto shape = x->shape_view(); + const auto diagonal = ctx->Attr("diagonal"); + const int64_t num_rows = shape.At(shape.NumAxes() - 2); + const int64_t num_cols = shape.At(shape.NumAxes() - 1); + user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("out", 0); + const int32_t elem_cnt = shape.elem_cnt(); + if (elem_cnt == 0) { return; } + if (num_cols % (kCudaWarpSize * 2) == 0) { + const int64_t total_rows = elem_cnt / num_cols; + TriuWarpProcessRowGpu<<stream()->As()->cuda_stream()>>>( + total_rows, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); + } else { + TriuGpu<<stream()->As()->cuda_stream()>>>( + elem_cnt, num_rows, num_cols, diagonal, x->dptr(), y->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_CUDA_TRIU_KERNEL(dtype) \ + REGISTER_USER_KERNEL("triu") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("out", 0) == GetDataType::value)) \ + .SetInplaceProposalFn([](const user_op::InferContext&, \ + user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe { \ + OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, true)); \ + return Maybe::Ok(); \ + }); + +REGISTER_CUDA_TRIU_KERNEL(half) +REGISTER_CUDA_TRIU_KERNEL(float) +REGISTER_CUDA_TRIU_KERNEL(double) +REGISTER_CUDA_TRIU_KERNEL(uint8_t) +REGISTER_CUDA_TRIU_KERNEL(int8_t) +REGISTER_CUDA_TRIU_KERNEL(int32_t) +REGISTER_CUDA_TRIU_KERNEL(int64_t) + } // namespace oneflow \ No newline at end of file 
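The warp-per-row kernels in the tril/triu files above all follow one pattern: each warp (a wavefront on AMD hardware) owns one matrix row, its lanes stride across that row's columns, and every element is kept or masked by comparing the column index against row + diagonal, so consecutive lanes touch consecutive columns and loads/stores stay coalesced. Because the template arguments and launch configuration of those kernels are garbled in this patch as rendered, here is a minimal self-contained HIP sketch of the same pattern; the fixed kWarpSize, the single-block launch, and the host driver are illustrative assumptions for this sketch only, not OneFlow's kCudaWarpSize or stream helpers.

// Minimal warp-per-row lower-triangular masking sketch (illustrative, not OneFlow API).
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

constexpr int kWarpSize = 64;  // AMD wavefront size; 32 on NVIDIA hardware (assumption for this sketch).

template<typename T>
__global__ void TrilWarpPerRow(int64_t total_rows, int64_t num_rows, int64_t num_cols,
                               int64_t diagonal, const T* x, T fill, T* y) {
  // One warp handles one row of one matrix in the (possibly batched) input.
  const int64_t warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / kWarpSize;
  const int64_t lane_id = threadIdx.x % kWarpSize;
  const int64_t num_warps = static_cast<int64_t>(blockDim.x) * gridDim.x / kWarpSize;
  for (int64_t i = warp_id; i < total_rows; i += num_warps) {
    const int64_t row = i % num_rows;  // row index inside the current matrix
    for (int64_t col = lane_id; col < num_cols; col += kWarpSize) {
      const int64_t idx = i * num_cols + col;
      // Keep elements on or below the shifted diagonal, write the fill value elsewhere.
      y[idx] = col > row + diagonal ? fill : x[idx];
    }
  }
}

int main() {
  const int64_t rows = 4, cols = 8;
  std::vector<float> h_x(rows * cols, 1.0f), h_y(rows * cols, 0.0f);
  float* d_x = nullptr;
  float* d_y = nullptr;
  hipMalloc(&d_x, h_x.size() * sizeof(float));
  hipMalloc(&d_y, h_y.size() * sizeof(float));
  hipMemcpy(d_x, h_x.data(), h_x.size() * sizeof(float), hipMemcpyHostToDevice);
  // 256 threads per block -> 4 wavefronts per block with kWarpSize == 64.
  hipLaunchKernelGGL(TrilWarpPerRow<float>, dim3(1), dim3(256), 0, 0,
                     rows, rows, cols, int64_t{0} /* diagonal */, d_x, 0.0f, d_y);
  hipMemcpy(h_y.data(), d_y, h_y.size() * sizeof(float), hipMemcpyDeviceToHost);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) { printf("%.0f ", h_y[r * cols + c]); }
    printf("\n");
  }
  hipFree(d_x);
  hipFree(d_y);
  return 0;
}

Compiled with hipcc, this prints a 4x8 lower-triangular mask of ones; flipping the comparison to col < row + diagonal (with a zero fill) gives the triu variant shown above, and multiplying x[idx] by a scale before the select gives the fused_scale_tril variant.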
diff --git a/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp b/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp index a12653b..bf27697 100644 --- a/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp +++ b/oneflow/user/kernels/two_stage_reduce_kernel_util.hip.cpp @@ -1,67 +1,67 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/user/kernels/two_stage_reduce_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -__global__ void DivideGpu(const int64_t n, const T* x, const K* count, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] / count[i]; } -} - -template -__global__ void MaskGpu(const int64_t n, const T* x, const K* mask, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = static_cast(mask[i]) * x[i]; } -} - -template -__global__ void ScaleGpu(const int64_t n, const T* x, const K* scale, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] * static_cast(scale[i]); } -} - -} // namespace - -template -struct TwoStageReduceKernelUtil { - static void Divide(ep::Stream* stream, const int64_t n, const T* x, const K* count, T* y) { - DivideGpu<<As()->cuda_stream()>>>(n, x, count, y); - } - - static void Mask(ep::Stream* stream, const int64_t n, const T* x, const K* mask, T* y) { - MaskGpu<<As()->cuda_stream()>>>(n, x, mask, y); - } - - static void Scale(ep::Stream* stream, const int64_t n, const T* x, const K* scale, T* y) { - ScaleGpu<<As()->cuda_stream()>>>(n, x, scale, y); - } -}; - -#define INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ - template struct TwoStageReduceKernelUtil; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA, - FLOATING_DATA_TYPE_SEQ INDEX_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ); -#undef INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/user/kernels/two_stage_reduce_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +__global__ void DivideGpu(const int64_t n, const T* x, const K* count, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] / count[i]; } +} + +template +__global__ void MaskGpu(const int64_t n, const T* x, const K* mask, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = static_cast(mask[i]) * x[i]; } +} + +template +__global__ void ScaleGpu(const int64_t n, const T* x, const K* scale, T* y) { + CUDA_1D_KERNEL_LOOP(i, n) { y[i] = x[i] * static_cast(scale[i]); } +} + +} // namespace + +template +struct TwoStageReduceKernelUtil { + static void Divide(ep::Stream* stream, const int64_t n, const T* x, const K* count, T* y) { + DivideGpu<<As()->cuda_stream()>>>(n, x, count, y); + } + + static void Mask(ep::Stream* stream, const int64_t n, const T* x, const K* mask, T* y) { + MaskGpu<<As()->cuda_stream()>>>(n, x, mask, y); + } + + static void Scale(ep::Stream* stream, const int64_t n, const T* x, const K* scale, T* y) { + ScaleGpu<<As()->cuda_stream()>>>(n, x, scale, y); + } +}; + +#define INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA(data_type_pair, index_type_pair) \ + template struct TwoStageReduceKernelUtil; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA, + FLOATING_DATA_TYPE_SEQ INDEX_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ); +#undef INSTANTIATE_TWO_STAGE_REDUCE_KERNEL_UTIL_CUDA + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/unfold_kernel_util.hip.cpp b/oneflow/user/kernels/unfold_kernel_util.hip.cpp index 7831c8f..21874a5 100644 --- a/oneflow/user/kernels/unfold_kernel_util.hip.cpp +++ b/oneflow/user/kernels/unfold_kernel_util.hip.cpp @@ -1,70 +1,70 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#ifdef WITH_ROCM -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/user/kernels/unfold_kernel_util.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace user_op { - -namespace { - -constexpr int kBlockSize = cuda::elementwise::kBlockSize; - -int GetNumBlocks(int64_t elem_cnt) { - int num_blocks = 0; - OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); - return num_blocks; -} - -// NDIM range: (1, 2, 3) -// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first -template -__global__ void CudaUnfoldForward(UnfoldParams params, const T* in, T* out) { - CUDA_1D_KERNEL_LOOP_T(INDEX_T, out_offset, params.out_elem_cnt) { - using ParamType = UnfoldParams; - INDEX_T in_index[ParamType::kInputNDim] = {0}; - INDEX_T out_index[ParamType::kOutputNDim] = {0}; - params.out_index_helper.OffsetToNdIndex(out_offset, out_index); - if (!UnfoldIndexTransform(params, out_index, in_index)) { - INDEX_T in_offset = params.in_index_helper.NdIndexToOffset(in_index); - out[out_offset] = in[in_offset]; - } else { - out[out_offset] = static_cast(kUnfoldPaddingValue); - } - } -} - -} // namespace - -template -struct UnfoldKernelUtil { - using ParamType = UnfoldParams; - static void Forward(ep::Stream* stream, const UnfoldParams* params, - const T* input_ptr, T* output_ptr) { - CudaUnfoldForward - <<out_elem_cnt), kBlockSize, 0, - stream->As()->cuda_stream()>>>(*params, input_ptr, output_ptr); - } -}; -INSTANTIATE_UNFOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) -} // namespace user_op -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#ifdef WITH_ROCM +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/user/kernels/unfold_kernel_util.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace user_op { + +namespace { + +constexpr int kBlockSize = cuda::elementwise::kBlockSize; + +int GetNumBlocks(int64_t elem_cnt) { + int num_blocks = 0; + OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks)); + return num_blocks; +} + +// NDIM range: (1, 2, 3) +// SDIM range: (1, 2), 1 indicates channels_last, 2 indicates channels_first +template +__global__ void CudaUnfoldForward(UnfoldParams params, const T* in, T* out) { + CUDA_1D_KERNEL_LOOP_T(INDEX_T, out_offset, params.out_elem_cnt) { + using ParamType = UnfoldParams; + INDEX_T in_index[ParamType::kInputNDim] = {0}; + INDEX_T out_index[ParamType::kOutputNDim] = {0}; + params.out_index_helper.OffsetToNdIndex(out_offset, out_index); + if (!UnfoldIndexTransform(params, out_index, in_index)) { + INDEX_T in_offset = params.in_index_helper.NdIndexToOffset(in_index); + out[out_offset] = in[in_offset]; + } else { + out[out_offset] = static_cast(kUnfoldPaddingValue); + } + } +} + +} // namespace + +template +struct UnfoldKernelUtil { + using ParamType = UnfoldParams; + static void Forward(ep::Stream* stream, const UnfoldParams* params, + const T* input_ptr, T* output_ptr) { + CudaUnfoldForward + <<out_elem_cnt), kBlockSize, 0, + stream->As()->cuda_stream()>>>(*params, input_ptr, output_ptr); + } +}; +INSTANTIATE_UNFOLD_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA) +} // namespace user_op +} // namespace oneflow #endif // WITH_ROCM \ No newline at end of file diff --git a/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp b/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp index cc21d30..c90cbc1 100644 --- a/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp +++ b/oneflow/user/kernels/unfold_tensor_kernel.hip.cpp @@ -1,222 +1,222 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/device/cuda_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/unfold_tensor_kernel_utils.h" - -namespace oneflow { - -namespace { - -const int32_t NDIMS = 16; -struct STRIDES { - int32_t val[NDIMS]; -}; - -template -__global__ void UnfoldTensorCudaKernel(const T* in_ptr, const STRIDES out_stride, - const STRIDES out_shape, const int32_t out_dims, - const int32_t elements, T* out_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = Offset(gid, out_stride.val, out_shape.val, out_dims - 1); - out_ptr[gid] = in_ptr[offset]; - gid += step; - } -} - -template -__global__ void UnfoldTensorGradCudaKernel(const T* dout_ptr, const STRIDES dout_stride, - const STRIDES dout_shape, const int32_t dout_dims, - const int32_t elements, T* din_ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - int32_t offset = Offset(gid, dout_stride.val, dout_shape.val, dout_dims - 1); - cuda::atomic::Add(&din_ptr[offset], dout_ptr[gid]); - gid += step; - } -} - -template -__global__ void InitPtr(const int32_t elements, T* ptr) { - int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; - int32_t step = gridDim.x * blockDim.x; - while (gid < elements) { - ptr[gid] = static_cast(0); - gid += step; - } -} - -template -struct GpuUnfoldTensorFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES out_stride, - const STRIDES out_shape, const int32_t out_dims, const int32_t elements, - T* out_ptr) { - RUN_CUDA_KERNEL((UnfoldTensorCudaKernel), stream, elements, in_ptr, out_stride, out_shape, - out_dims, elements, out_ptr); - } -}; - -template -struct GpuUnfoldTensorGradFunctor final { - void operator()(ep::Stream* stream, const T* dout_ptr, const STRIDES dout_stride, - const STRIDES dout_shape, const int32_t dout_dims, const int32_t dout_elements, - const int32_t din_elements, T* din_ptr) { - RUN_CUDA_KERNEL((InitPtr), stream, din_elements, din_elements, din_ptr); - RUN_CUDA_KERNEL((UnfoldTensorGradCudaKernel), stream, dout_elements, dout_ptr, dout_stride, - dout_shape, dout_dims, dout_elements, din_ptr); - } -}; - -} // namespace - -template -class GpuUnfoldTensorKernel final : public user_op::OpKernel { - public: - GpuUnfoldTensorKernel() = default; - ~GpuUnfoldTensorKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("y", 0); - - const ShapeView& in_shape = in->shape_view(); - std::vector out_shape; - out_shape.resize(out->shape_view().NumAxes()); - for (int i = 0; i < out->shape_view().NumAxes(); ++i) { - out_shape[i] = out->shape_view().At(i); - } - const int32_t in_dims = in_shape.NumAxes(); - const int32_t out_dims = out_shape.size(); - const int32_t dimension = ctx->Attr("dimension"); - const int32_t step = ctx->Attr("step"); - - std::vector in_stride(in_dims, 1); - for (int32_t i = in_dims - 2; i >= 0; --i) { - in_stride[i] = in_shape.At(i + 1) * in_stride.at(i + 1); - } - - std::vector out_stride(in_dims + 1); - out_stride[in_dims] = in_dims == 0 ? 
1 : in_stride[dimension]; - for (int d = 0; d < in_dims; ++d) { - if (d == dimension) { - out_stride[d] = step * in_stride[d]; - } else { - out_stride[d] = in_stride[d]; - } - } - - const T* in_ptr = in->dptr(); - T* out_ptr = out->mut_dptr(); - const int32_t out_size = out->shape_view().elem_cnt(); - - STRIDES out_stride_cuda; - for (int i = 0; i < out_dims; ++i) { out_stride_cuda.val[i] = out_stride[i]; } - STRIDES out_shape_cuda; - for (int i = 0; i < out_dims; ++i) { out_shape_cuda.val[i] = out_shape[i]; } - - GpuUnfoldTensorFunctor()(ctx->stream(), in_ptr, out_stride_cuda, out_shape_cuda, out_dims, - out_size, out_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UNFOLD_TENSOR_KERNEL(dtype) \ - REGISTER_USER_KERNEL("unfold_tensor") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)) - -REGISTER_UNFOLD_TENSOR_KERNEL(float); -REGISTER_UNFOLD_TENSOR_KERNEL(double); -REGISTER_UNFOLD_TENSOR_KERNEL(int32_t); -REGISTER_UNFOLD_TENSOR_KERNEL(int64_t); - -template -class GpuUnfoldTensorGradKernel final : public user_op::OpKernel { - public: - GpuUnfoldTensorGradKernel() = default; - ~GpuUnfoldTensorGradKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* dout = ctx->Tensor4ArgNameAndIndex("dy", 0); - const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* din = ctx->Tensor4ArgNameAndIndex("dx", 0); - - const ShapeView& in_shape = in->shape_view(); - const int32_t in_dims = in_shape.NumAxes(); - std::vector din_stride(in_dims, 1); - for (int32_t i = in_dims - 2; i >= 0; --i) { - din_stride[i] = in_shape.At(i + 1) * din_stride.at(i + 1); - } - - std::vector dout_shape; - dout_shape.resize(dout->shape_view().NumAxes()); - for (int i = 0; i < dout->shape_view().NumAxes(); ++i) { - dout_shape[i] = dout->shape_view().At(i); - } - - const int32_t dout_dims = dout_shape.size(); - const int32_t dimension = ctx->Attr("dimension"); - const int32_t step = ctx->Attr("step"); - - std::vector dout_stride(in_dims + 1); - dout_stride[in_dims] = in_dims == 0 ? 
1 : din_stride[dimension]; - for (int d = 0; d < in_dims; ++d) { - if (d == dimension) { - dout_stride[d] = step * din_stride[d]; - } else { - dout_stride[d] = din_stride[d]; - } - } - - STRIDES dout_stride_cuda; - for (int i = 0; i < dout_dims; ++i) { dout_stride_cuda.val[i] = dout_stride[i]; } - STRIDES dout_shape_cuda; - for (int i = 0; i < dout_dims; ++i) { dout_shape_cuda.val[i] = dout_shape[i]; } - - const T* dout_ptr = dout->dptr(); - T* din_ptr = din->mut_dptr(); - const int32_t dout_size = dout->shape_view().elem_cnt(); - const int32_t din_size = din->shape_view().elem_cnt(); - - GpuUnfoldTensorGradFunctor()(ctx->stream(), dout_ptr, dout_stride_cuda, dout_shape_cuda, - dout_dims, dout_size, din_size, din_ptr); - } - - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(dtype) \ - REGISTER_USER_KERNEL("unfold_tensor_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("x", 0) == GetDataType::value)) - -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(float); -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(double); -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int32_t); -REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int64_t); - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
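// The unfold_tensor kernels above (and their re-added copies below) address the output
// through strides derived from the input: the unfolded dimension's stride is multiplied
// by `step`, and one extra trailing dimension of length `size` reuses the original
// stride of `dimension`. A minimal host-side sketch of that shape/stride computation
// for a contiguous tensor; the names here are illustrative, not the patch's helpers:
#include <cstdint>
#include <iostream>
#include <vector>

struct UnfoldedView {
  std::vector<int64_t> shape;
  std::vector<int64_t> stride;
};

// Computes the view produced by tensor.unfold(dimension, size, step).
UnfoldedView UnfoldView(const std::vector<int64_t>& in_shape, int dimension, int64_t size,
                        int64_t step) {
  const int ndim = static_cast<int>(in_shape.size());
  std::vector<int64_t> in_stride(ndim, 1);
  for (int i = ndim - 2; i >= 0; --i) { in_stride[i] = in_shape[i + 1] * in_stride[i + 1]; }
  UnfoldedView out;
  out.shape = in_shape;
  out.shape[dimension] = (in_shape[dimension] - size) / step + 1;  // number of windows
  out.shape.push_back(size);                                       // elements per window
  out.stride = in_stride;
  out.stride[dimension] = step * in_stride[dimension];  // hop between windows
  out.stride.push_back(in_stride[dimension]);           // hop inside a window
  return out;
}

int main() {
  // A (2, 10) tensor unfolded along dim 1 with size 4, step 2 -> shape (2, 4, 4), stride (10, 2, 1).
  UnfoldedView v = UnfoldView({2, 10}, /*dimension=*/1, /*size=*/4, /*step=*/2);
  for (int64_t s : v.shape) { std::cout << s << ' '; }
  std::cout << '\n';
  for (int64_t s : v.stride) { std::cout << s << ' '; }
  std::cout << '\n';
  return 0;
}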
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/device/cuda_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/unfold_tensor_kernel_utils.h" + +namespace oneflow { + +namespace { + +const int32_t NDIMS = 16; +struct STRIDES { + int32_t val[NDIMS]; +}; + +template +__global__ void UnfoldTensorCudaKernel(const T* in_ptr, const STRIDES out_stride, + const STRIDES out_shape, const int32_t out_dims, + const int32_t elements, T* out_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = Offset(gid, out_stride.val, out_shape.val, out_dims - 1); + out_ptr[gid] = in_ptr[offset]; + gid += step; + } +} + +template +__global__ void UnfoldTensorGradCudaKernel(const T* dout_ptr, const STRIDES dout_stride, + const STRIDES dout_shape, const int32_t dout_dims, + const int32_t elements, T* din_ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + int32_t offset = Offset(gid, dout_stride.val, dout_shape.val, dout_dims - 1); + cuda::atomic::Add(&din_ptr[offset], dout_ptr[gid]); + gid += step; + } +} + +template +__global__ void InitPtr(const int32_t elements, T* ptr) { + int32_t gid = (blockDim.x * blockIdx.x) + threadIdx.x; + int32_t step = gridDim.x * blockDim.x; + while (gid < elements) { + ptr[gid] = static_cast(0); + gid += step; + } +} + +template +struct GpuUnfoldTensorFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, const STRIDES out_stride, + const STRIDES out_shape, const int32_t out_dims, const int32_t elements, + T* out_ptr) { + RUN_CUDA_KERNEL((UnfoldTensorCudaKernel), stream, elements, in_ptr, out_stride, out_shape, + out_dims, elements, out_ptr); + } +}; + +template +struct GpuUnfoldTensorGradFunctor final { + void operator()(ep::Stream* stream, const T* dout_ptr, const STRIDES dout_stride, + const STRIDES dout_shape, const int32_t dout_dims, const int32_t dout_elements, + const int32_t din_elements, T* din_ptr) { + RUN_CUDA_KERNEL((InitPtr), stream, din_elements, din_elements, din_ptr); + RUN_CUDA_KERNEL((UnfoldTensorGradCudaKernel), stream, dout_elements, dout_ptr, dout_stride, + dout_shape, dout_dims, dout_elements, din_ptr); + } +}; + +} // namespace + +template +class GpuUnfoldTensorKernel final : public user_op::OpKernel { + public: + GpuUnfoldTensorKernel() = default; + ~GpuUnfoldTensorKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("y", 0); + + const ShapeView& in_shape = in->shape_view(); + std::vector out_shape; + out_shape.resize(out->shape_view().NumAxes()); + for (int i = 0; i < out->shape_view().NumAxes(); ++i) { + out_shape[i] = out->shape_view().At(i); + } + const int32_t in_dims = in_shape.NumAxes(); + const int32_t out_dims = out_shape.size(); + const int32_t dimension = ctx->Attr("dimension"); + const int32_t step = ctx->Attr("step"); + + std::vector in_stride(in_dims, 1); + for (int32_t i = in_dims - 2; i >= 0; --i) { + in_stride[i] = in_shape.At(i + 1) * in_stride.at(i + 1); + } + + std::vector out_stride(in_dims + 1); + out_stride[in_dims] = in_dims == 0 ? 
1 : in_stride[dimension]; + for (int d = 0; d < in_dims; ++d) { + if (d == dimension) { + out_stride[d] = step * in_stride[d]; + } else { + out_stride[d] = in_stride[d]; + } + } + + const T* in_ptr = in->dptr(); + T* out_ptr = out->mut_dptr(); + const int32_t out_size = out->shape_view().elem_cnt(); + + STRIDES out_stride_cuda; + for (int i = 0; i < out_dims; ++i) { out_stride_cuda.val[i] = out_stride[i]; } + STRIDES out_shape_cuda; + for (int i = 0; i < out_dims; ++i) { out_shape_cuda.val[i] = out_shape[i]; } + + GpuUnfoldTensorFunctor()(ctx->stream(), in_ptr, out_stride_cuda, out_shape_cuda, out_dims, + out_size, out_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UNFOLD_TENSOR_KERNEL(dtype) \ + REGISTER_USER_KERNEL("unfold_tensor") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)) + +REGISTER_UNFOLD_TENSOR_KERNEL(float); +REGISTER_UNFOLD_TENSOR_KERNEL(double); +REGISTER_UNFOLD_TENSOR_KERNEL(int32_t); +REGISTER_UNFOLD_TENSOR_KERNEL(int64_t); + +template +class GpuUnfoldTensorGradKernel final : public user_op::OpKernel { + public: + GpuUnfoldTensorGradKernel() = default; + ~GpuUnfoldTensorGradKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* dout = ctx->Tensor4ArgNameAndIndex("dy", 0); + const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* din = ctx->Tensor4ArgNameAndIndex("dx", 0); + + const ShapeView& in_shape = in->shape_view(); + const int32_t in_dims = in_shape.NumAxes(); + std::vector din_stride(in_dims, 1); + for (int32_t i = in_dims - 2; i >= 0; --i) { + din_stride[i] = in_shape.At(i + 1) * din_stride.at(i + 1); + } + + std::vector dout_shape; + dout_shape.resize(dout->shape_view().NumAxes()); + for (int i = 0; i < dout->shape_view().NumAxes(); ++i) { + dout_shape[i] = dout->shape_view().At(i); + } + + const int32_t dout_dims = dout_shape.size(); + const int32_t dimension = ctx->Attr("dimension"); + const int32_t step = ctx->Attr("step"); + + std::vector dout_stride(in_dims + 1); + dout_stride[in_dims] = in_dims == 0 ? 
1 : din_stride[dimension]; + for (int d = 0; d < in_dims; ++d) { + if (d == dimension) { + dout_stride[d] = step * din_stride[d]; + } else { + dout_stride[d] = din_stride[d]; + } + } + + STRIDES dout_stride_cuda; + for (int i = 0; i < dout_dims; ++i) { dout_stride_cuda.val[i] = dout_stride[i]; } + STRIDES dout_shape_cuda; + for (int i = 0; i < dout_dims; ++i) { dout_shape_cuda.val[i] = dout_shape[i]; } + + const T* dout_ptr = dout->dptr(); + T* din_ptr = din->mut_dptr(); + const int32_t dout_size = dout->shape_view().elem_cnt(); + const int32_t din_size = din->shape_view().elem_cnt(); + + GpuUnfoldTensorGradFunctor()(ctx->stream(), dout_ptr, dout_stride_cuda, dout_shape_cuda, + dout_dims, dout_size, din_size, din_ptr); + } + + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(dtype) \ + REGISTER_USER_KERNEL("unfold_tensor_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("x", 0) == GetDataType::value)) + +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(float); +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(double); +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int32_t); +REGISTER_UNFOLD_TENSOR_GRAD_KERNEL(int64_t); + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/unique_kernel_util.hip.cpp b/oneflow/user/kernels/unique_kernel_util.hip.cpp index b69bba1..287850f 100644 --- a/oneflow/user/kernels/unique_kernel_util.hip.cpp +++ b/oneflow/user/kernels/unique_kernel_util.hip.cpp @@ -1,86 +1,86 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/unique_kernel_util.h" -#include "oneflow/core/hip/unique.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -constexpr cuda::unique::Flag kUniqueFlag = cuda::unique::kOutputInverseIndices; -constexpr cuda::unique::Flag kUniqueWithCountsFlag = - cuda::unique::kOutputInverseIndices | cuda::unique::kOutputCounts; - -} // namespace - -template -struct UniqueKernelUtil { - static void Unique(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, - IDX* idx_out, void* workspace, int64_t workspace_size_in_bytes); - static void UniqueWithCounts(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, - KEY* unique_out, IDX* idx_out, IDX* count, void* workspace, - int64_t workspace_size_in_bytes); - static void GetUniqueWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, - int64_t* workspace_size_in_bytes); - static void GetUniqueWithCountsWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, - int64_t* workspace_size_in_bytes); -}; - -template -void UniqueKernelUtil::Unique(ep::Stream* stream, int64_t n, - const KEY* in, IDX* num_unique, - KEY* unique_out, IDX* idx_out, - void* workspace, - int64_t workspace_size_in_bytes) { - OF_CUDA_CHECK((cuda::unique::Launch(kUniqueFlag, n, in, unique_out, num_unique, idx_out, - nullptr, workspace, workspace_size_in_bytes, - stream->As()->cuda_stream()))); -} - -template -void UniqueKernelUtil::UniqueWithCounts( - ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, IDX* idx_out, - IDX* count, void* workspace, int64_t workspace_size_in_bytes) { - OF_CUDA_CHECK((cuda::unique::Launch( - kUniqueWithCountsFlag, n, in, unique_out, num_unique, idx_out, count, workspace, - workspace_size_in_bytes, stream->As()->cuda_stream()))); -} - -template -void UniqueKernelUtil::GetUniqueWorkspaceSizeInBytes( - ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { - size_t ws = 0; - OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueFlag, n, &ws))); - *workspace_size_in_bytes = static_cast(ws); -} - -template -void UniqueKernelUtil::GetUniqueWithCountsWorkspaceSizeInBytes( - ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { - size_t ws = 0; - OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueWithCountsFlag, n, &ws))); - *workspace_size_in_bytes = static_cast(ws); -} - -#define INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA(key_type_pair, idx_type_pair) \ - template struct UniqueKernelUtil; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ, - INDEX_DATA_TYPE_SEQ); -#undef INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA - -} // namespace oneflow +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
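// The UniqueKernelUtil code above delegates to cuda::unique::Launch after a separate
// workspace-size query. Functionally, "unique with counts" produces the distinct keys,
// the number of distinct keys, an inverse index for every input element, and a per-key
// count. A CPU reference of those semantics for comparison/testing; this is not the
// CUDA implementation, and the order of the unique keys it produces may differ from
// the device kernel's:
#include <cstdint>
#include <unordered_map>
#include <vector>

template<typename KEY, typename IDX>
void UniqueWithCountsRef(const std::vector<KEY>& in, std::vector<KEY>* unique_out,
                         std::vector<IDX>* idx_out, std::vector<IDX>* count, IDX* num_unique) {
  std::unordered_map<KEY, IDX> key2idx;
  unique_out->clear();
  count->clear();
  idx_out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    auto it = key2idx.find(in[i]);
    if (it == key2idx.end()) {
      const IDX new_idx = static_cast<IDX>(unique_out->size());
      it = key2idx.emplace(in[i], new_idx).first;
      unique_out->push_back(in[i]);
      count->push_back(0);
    }
    (*idx_out)[i] = it->second;  // inverse index: position of in[i] among the unique keys
    (*count)[it->second] += 1;   // occurrence count of this key
  }
  *num_unique = static_cast<IDX>(unique_out->size());
}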
+*/ +#include "oneflow/user/kernels/unique_kernel_util.h" +#include "oneflow/core/hip/unique.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +constexpr cuda::unique::Flag kUniqueFlag = cuda::unique::kOutputInverseIndices; +constexpr cuda::unique::Flag kUniqueWithCountsFlag = + cuda::unique::kOutputInverseIndices | cuda::unique::kOutputCounts; + +} // namespace + +template +struct UniqueKernelUtil { + static void Unique(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, + IDX* idx_out, void* workspace, int64_t workspace_size_in_bytes); + static void UniqueWithCounts(ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, + KEY* unique_out, IDX* idx_out, IDX* count, void* workspace, + int64_t workspace_size_in_bytes); + static void GetUniqueWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, + int64_t* workspace_size_in_bytes); + static void GetUniqueWithCountsWorkspaceSizeInBytes(ep::Stream* stream, int64_t n, + int64_t* workspace_size_in_bytes); +}; + +template +void UniqueKernelUtil::Unique(ep::Stream* stream, int64_t n, + const KEY* in, IDX* num_unique, + KEY* unique_out, IDX* idx_out, + void* workspace, + int64_t workspace_size_in_bytes) { + OF_CUDA_CHECK((cuda::unique::Launch(kUniqueFlag, n, in, unique_out, num_unique, idx_out, + nullptr, workspace, workspace_size_in_bytes, + stream->As()->cuda_stream()))); +} + +template +void UniqueKernelUtil::UniqueWithCounts( + ep::Stream* stream, int64_t n, const KEY* in, IDX* num_unique, KEY* unique_out, IDX* idx_out, + IDX* count, void* workspace, int64_t workspace_size_in_bytes) { + OF_CUDA_CHECK((cuda::unique::Launch( + kUniqueWithCountsFlag, n, in, unique_out, num_unique, idx_out, count, workspace, + workspace_size_in_bytes, stream->As()->cuda_stream()))); +} + +template +void UniqueKernelUtil::GetUniqueWorkspaceSizeInBytes( + ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { + size_t ws = 0; + OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueFlag, n, &ws))); + *workspace_size_in_bytes = static_cast(ws); +} + +template +void UniqueKernelUtil::GetUniqueWithCountsWorkspaceSizeInBytes( + ep::Stream* stream, int64_t n, int64_t* workspace_size_in_bytes) { + size_t ws = 0; + OF_CUDA_CHECK((cuda::unique::GetWorkspaceSize(kUniqueWithCountsFlag, n, &ws))); + *workspace_size_in_bytes = static_cast(ws); +} + +#define INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA(key_type_pair, idx_type_pair) \ + template struct UniqueKernelUtil; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA, ARITHMETIC_DATA_TYPE_SEQ, + INDEX_DATA_TYPE_SEQ); +#undef INSTANTIATE_UNIQUE_KERNEL_UTIL_CUDA + +} // namespace oneflow diff --git a/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp b/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp index 6334ee7..1b9dfa9 100644 --- a/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp +++ b/oneflow/user/kernels/unsorted_segment_sum_kernel_util.hip.cpp @@ -1,222 +1,222 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/core/kernel/kernel.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" -#include - -namespace oneflow { - -namespace { - -template -__device__ __forceinline__ bool IsZero(T v) { - return v == 0; -} - -template<> -__device__ __forceinline__ bool IsZero(half v) { - return v == static_cast(0); -} - -template<> -__device__ __forceinline__ bool IsZero(half2 v) { - // return v.x == static_cast(0) && v.y == static_cast(0); - return v.data.x == 0 && v.data.y == 0; -} - -template -__global__ void UnsortedSegmentSumGpu(const IDX data_elem_cnt, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, const U* data, - const K* segment_ids, const IDX num_segments, - const IDX segment_id_offset, T* out) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { - const U val = data[i]; - if (!IsZero(val)) { - IDX outer_idx, segment_id_idx, inner_idx; - in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx, inner_idx); - const K origin_idx = segment_ids[segment_id_idx]; - assert(origin_idx >= 0); - const IDX idx = origin_idx - segment_id_offset; - if (idx >= 0 && idx < num_segments) { - const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx, inner_idx); - if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } - } - } - } -} - -template -__global__ void UnsortedSegmentColSumGpu(const IDX data_elem_cnt, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, - const U* data, const K* segment_ids, - const IDX num_segments, const IDX segment_id_offset, - T* out) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { - const U val = data[i]; - if (!IsZero(val)) { - IDX outer_idx, segment_id_idx; - in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx); - const K origin_idx = segment_ids[segment_id_idx]; - assert(origin_idx >= 0); - const IDX idx = origin_idx - segment_id_offset; - if (idx >= 0 && idx < num_segments) { - const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx); - if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } - } - } - } -} - -template -__global__ void UnsortedSegmentRowSumGpu(const IDX data_elem_cnt, - const NdIndexOffsetHelper in_helper, - const NdIndexOffsetHelper out_helper, - const U* data, const K* segment_ids, - const IDX num_segments, const IDX segment_id_offset, - T* out) { - CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { - const U val = data[i]; - if (!IsZero(val)) { - IDX segment_id_idx, inner_idx; - in_helper.OffsetToNdIndex(i, segment_id_idx, inner_idx); - const K origin_idx = segment_ids[segment_id_idx]; - assert(origin_idx >= 0); - const IDX idx = origin_idx - segment_id_offset; - if (idx >= 0 && idx < num_segments) { - const int64_t out_offset = out_helper.NdIndexToOffset(idx, inner_idx); - if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } - } - } - } -} - -template -void UnsortedSegmentSumUtil(ep::Stream* stream, const K* segment_ids, const U* data, - IDX num_segment_ids, IDX num_segments, IDX outer_dim_size, - IDX inner_dim_size, IDX segment_id_offset, T* out) { - const IDX data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; - if (inner_dim_size == 1) { - NdIndexOffsetHelper in_helper(outer_dim_size, 
num_segment_ids); - NdIndexOffsetHelper out_helper(outer_dim_size, num_segments); - UnsortedSegmentColSumGpu - <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, - data, segment_ids, num_segments, - segment_id_offset, out); - - } else if (outer_dim_size == 1) { - NdIndexOffsetHelper in_helper(num_segment_ids, inner_dim_size); - NdIndexOffsetHelper out_helper(num_segments, inner_dim_size); - UnsortedSegmentRowSumGpu - <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, - data, segment_ids, num_segments, - segment_id_offset, out); - - } else { - NdIndexOffsetHelper in_helper(outer_dim_size, num_segment_ids, inner_dim_size); - NdIndexOffsetHelper out_helper(outer_dim_size, num_segments, inner_dim_size); - UnsortedSegmentSumGpu - <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, - data, segment_ids, num_segments, - segment_id_offset, out); - } -} - -template -void DispatchDataType(ep::Stream* stream, const K* segment_ids, const U* data, - int64_t num_segment_ids, int64_t num_segments, int64_t outer_dim_size, - int64_t inner_dim_size, int64_t segment_id_offset, T* out) { - auto* cuda_stream = stream->As(); - if (std::is_same::value && std::is_same::value - && cuda_stream->device_properties().major >= 6 - && reinterpret_cast(data) % sizeof(half2) == 0 - && reinterpret_cast(out) % sizeof(half2) == 0 && inner_dim_size % 2 == 0) { - UnsortedSegmentSumUtil( - stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, - outer_dim_size, inner_dim_size / 2, segment_id_offset, reinterpret_cast(out)); - } else { - UnsortedSegmentSumUtil(stream, segment_ids, data, num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } -} - -} // namespace - -template -struct UnsortedSegmentSumKernelUtil final { - static void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const U* data, - int64_t num_segment_ids, int64_t num_segments, - int64_t outer_dim_size, int64_t inner_dim_size, - int64_t segment_id_offset, T* out) { - const int64_t data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; - const int64_t out_elem_cnt = outer_dim_size * num_segments * inner_dim_size; - - if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { - DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } else { - DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } - } -}; - -template -struct UnsortedSegmentSumKernelUtil final { - static void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const float16* data, - int64_t num_segment_ids, int64_t num_segments, - int64_t outer_dim_size, int64_t inner_dim_size, - int64_t segment_id_offset, float* out) { - UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( - stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, - outer_dim_size, inner_dim_size, segment_id_offset, out); - } -}; - -#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA(in_type_pair, index_type_pair) \ - template struct UnsortedSegmentSumKernelUtil; -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA, - UNSORTED_SEGMENT_SUM_DATA_TYPE_SEQ, - UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ); -#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA - -#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA(in_type_pair, index_type_pair, \ - out_type_pair) \ - template struct 
UnsortedSegmentSumKernelUtil; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA, - OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat), - UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ, FLOAT16_DATA_TYPE_SEQ); - -#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA - -template struct UnsortedSegmentSumKernelUtil; -template struct UnsortedSegmentSumKernelUtil; -template struct UnsortedSegmentSumKernelUtil; - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/user/kernels/unsorted_segment_sum_kernel_util.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/core/kernel/kernel.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" +#include + +namespace oneflow { + +namespace { + +template +__device__ __forceinline__ bool IsZero(T v) { + return v == 0; +} + +template<> +__device__ __forceinline__ bool IsZero(half v) { + return v == static_cast(0); +} + +template<> +__device__ __forceinline__ bool IsZero(half2 v) { + // return v.x == static_cast(0) && v.y == static_cast(0); + return v.data.x == 0 && v.data.y == 0; +} + +template +__global__ void UnsortedSegmentSumGpu(const IDX data_elem_cnt, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, const U* data, + const K* segment_ids, const IDX num_segments, + const IDX segment_id_offset, T* out) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { + const U val = data[i]; + if (!IsZero(val)) { + IDX outer_idx, segment_id_idx, inner_idx; + in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx, inner_idx); + const K origin_idx = segment_ids[segment_id_idx]; + assert(origin_idx >= 0); + const IDX idx = origin_idx - segment_id_offset; + if (idx >= 0 && idx < num_segments) { + const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx, inner_idx); + if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } + } + } + } +} + +template +__global__ void UnsortedSegmentColSumGpu(const IDX data_elem_cnt, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, + const U* data, const K* segment_ids, + const IDX num_segments, const IDX segment_id_offset, + T* out) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { + const U val = data[i]; + if (!IsZero(val)) { + IDX outer_idx, segment_id_idx; + in_helper.OffsetToNdIndex(i, outer_idx, segment_id_idx); + const K origin_idx = segment_ids[segment_id_idx]; + assert(origin_idx >= 0); + const IDX idx = origin_idx - segment_id_offset; + if (idx >= 0 && idx < num_segments) { + const int64_t out_offset = out_helper.NdIndexToOffset(outer_idx, idx); + if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } + } + } + } +} + +template +__global__ void UnsortedSegmentRowSumGpu(const IDX data_elem_cnt, + const NdIndexOffsetHelper in_helper, + const NdIndexOffsetHelper out_helper, + const U* data, const K* segment_ids, + const IDX num_segments, const IDX 
segment_id_offset, + T* out) { + CUDA_1D_KERNEL_LOOP_T(IDX, i, data_elem_cnt) { + const U val = data[i]; + if (!IsZero(val)) { + IDX segment_id_idx, inner_idx; + in_helper.OffsetToNdIndex(i, segment_id_idx, inner_idx); + const K origin_idx = segment_ids[segment_id_idx]; + assert(origin_idx >= 0); + const IDX idx = origin_idx - segment_id_offset; + if (idx >= 0 && idx < num_segments) { + const int64_t out_offset = out_helper.NdIndexToOffset(idx, inner_idx); + if (out_offset >= 0) { cuda::atomic::Add(out + out_offset, static_cast(val)); } + } + } + } +} + +template +void UnsortedSegmentSumUtil(ep::Stream* stream, const K* segment_ids, const U* data, + IDX num_segment_ids, IDX num_segments, IDX outer_dim_size, + IDX inner_dim_size, IDX segment_id_offset, T* out) { + const IDX data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; + if (inner_dim_size == 1) { + NdIndexOffsetHelper in_helper(outer_dim_size, num_segment_ids); + NdIndexOffsetHelper out_helper(outer_dim_size, num_segments); + UnsortedSegmentColSumGpu + <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, + data, segment_ids, num_segments, + segment_id_offset, out); + + } else if (outer_dim_size == 1) { + NdIndexOffsetHelper in_helper(num_segment_ids, inner_dim_size); + NdIndexOffsetHelper out_helper(num_segments, inner_dim_size); + UnsortedSegmentRowSumGpu + <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, + data, segment_ids, num_segments, + segment_id_offset, out); + + } else { + NdIndexOffsetHelper in_helper(outer_dim_size, num_segment_ids, inner_dim_size); + NdIndexOffsetHelper out_helper(outer_dim_size, num_segments, inner_dim_size); + UnsortedSegmentSumGpu + <<As()->cuda_stream()>>>(data_elem_cnt, in_helper, out_helper, + data, segment_ids, num_segments, + segment_id_offset, out); + } +} + +template +void DispatchDataType(ep::Stream* stream, const K* segment_ids, const U* data, + int64_t num_segment_ids, int64_t num_segments, int64_t outer_dim_size, + int64_t inner_dim_size, int64_t segment_id_offset, T* out) { + auto* cuda_stream = stream->As(); + if (std::is_same::value && std::is_same::value + && cuda_stream->device_properties().major >= 6 + && reinterpret_cast(data) % sizeof(half2) == 0 + && reinterpret_cast(out) % sizeof(half2) == 0 && inner_dim_size % 2 == 0) { + UnsortedSegmentSumUtil( + stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, + outer_dim_size, inner_dim_size / 2, segment_id_offset, reinterpret_cast(out)); + } else { + UnsortedSegmentSumUtil(stream, segment_ids, data, num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } +} + +} // namespace + +template +struct UnsortedSegmentSumKernelUtil final { + static void UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const U* data, + int64_t num_segment_ids, int64_t num_segments, + int64_t outer_dim_size, int64_t inner_dim_size, + int64_t segment_id_offset, T* out) { + const int64_t data_elem_cnt = num_segment_ids * outer_dim_size * inner_dim_size; + const int64_t out_elem_cnt = outer_dim_size * num_segments * inner_dim_size; + + if (std::max(data_elem_cnt, out_elem_cnt) < GetMaxVal() / 2) { + DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } else { + DispatchDataType(stream, segment_ids, data, num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } + } +}; + +template +struct UnsortedSegmentSumKernelUtil final { + static void 
UnsortedSegmentSum(ep::Stream* stream, const K* segment_ids, const float16* data, + int64_t num_segment_ids, int64_t num_segments, + int64_t outer_dim_size, int64_t inner_dim_size, + int64_t segment_id_offset, float* out) { + UnsortedSegmentSumKernelUtil::UnsortedSegmentSum( + stream, segment_ids, reinterpret_cast(data), num_segment_ids, num_segments, + outer_dim_size, inner_dim_size, segment_id_offset, out); + } +}; + +#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA(in_type_pair, index_type_pair) \ + template struct UnsortedSegmentSumKernelUtil; +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA, + UNSORTED_SEGMENT_SUM_DATA_TYPE_SEQ, + UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ); +#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_UTIL_CUDA + +#define INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA(in_type_pair, index_type_pair, \ + out_type_pair) \ + template struct UnsortedSegmentSumKernelUtil; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA, + OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat), + UNSORTED_SEGMENT_SUM_INDEX_TYPE_SEQ, FLOAT16_DATA_TYPE_SEQ); + +#undef INITIATE_UNSORTED_SEGMENT_SUM_KERNEL_HALF_CUDA + +template struct UnsortedSegmentSumKernelUtil; +template struct UnsortedSegmentSumKernelUtil; +template struct UnsortedSegmentSumKernelUtil; + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp b/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp index 6525f41..a9324b9 100644 --- a/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_bicubic_2d_kernel.hip.cpp @@ -1,234 +1,234 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
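// The UnsortedSegmentSumGpu kernels above scatter data[outer, i, inner] into
// out[outer, segment_ids[i] - segment_id_offset, inner] with atomic adds, skipping ids
// that fall outside [0, num_segments). A small CPU reference of the same reduction
// (not the patch's kernel; flat row-major indexing is assumed):
#include <cstdint>
#include <vector>

template<typename T, typename K>
void UnsortedSegmentSumRef(const std::vector<K>& segment_ids, const std::vector<T>& data,
                           int64_t num_segments, int64_t outer_dim, int64_t inner_dim,
                           int64_t segment_id_offset, std::vector<T>* out) {
  const int64_t num_ids = static_cast<int64_t>(segment_ids.size());
  out->assign(outer_dim * num_segments * inner_dim, static_cast<T>(0));
  for (int64_t o = 0; o < outer_dim; ++o) {
    for (int64_t i = 0; i < num_ids; ++i) {
      const int64_t seg = static_cast<int64_t>(segment_ids[i]) - segment_id_offset;
      if (seg < 0 || seg >= num_segments) { continue; }  // out-of-range ids are dropped
      for (int64_t j = 0; j < inner_dim; ++j) {
        (*out)[(o * num_segments + seg) * inner_dim + j] +=
            data[(o * num_ids + i) * inner_dim + j];
      }
    }
  }
}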
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__device__ void upsample_increment_value_bounded_cuda(T* data, int64_t width, int64_t height, - int64_t x, int64_t y, T value) { - int64_t access_x = max(min(x, width - 1), static_cast(0)); - int64_t access_y = max(min(y, height - 1), static_cast(0)); - cuda::atomic::Add(data + access_y * width + access_x, value); -} - -template -__global__ void UpsampleBicubic2dForward(const int64_t elem_cnt, const T* in_dptr, - const int64_t nbatch, const int64_t channels, - const int64_t in_height, const int64_t in_width, - const int64_t out_height, const int64_t out_width, - const float scale_height, const float scale_width, - bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { - const int output_x = idx % out_width; - const int output_y = idx / out_width; - - const T* in = in_dptr; - T* out = out_dptr; - - const T real_x = GetAreaPixel(scale_width, output_x, align_corners, /*cubic=*/true); - int64_t input_x = floor(1.0 * real_x); - const T t_x = real_x - input_x; - - const T real_y = GetAreaPixel(scale_height, output_y, align_corners, /*cubic=*/true); - int64_t input_y = floor(1.0 * real_y); - const T t_y = real_y - input_y; - - for (int64_t c = 0; c < channels * nbatch; c++) { - T coefficients[4]; - - // Interpolate 4 times in the x direction - for (int64_t i = 0; i < 4; i++) { - coefficients[i] = cubic_interp1d( - upsample_get_value_bounded(in, in_width, in_height, input_x - 1, input_y - 1 + i), - upsample_get_value_bounded(in, in_width, in_height, input_x + 0, input_y - 1 + i), - upsample_get_value_bounded(in, in_width, in_height, input_x + 1, input_y - 1 + i), - upsample_get_value_bounded(in, in_width, in_height, input_x + 2, input_y - 1 + i), - t_x); - } - - // Interpolate in the y direction using x interpolations - out[output_y * out_width + output_x] = cubic_interp1d( - coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y); - - // Move to next channel - in += in_width * in_height; - out += out_width * out_height; - } - } -} - -template -__global__ void UpsampleBicubic2dBackward(const int64_t elem_cnt, const T* dy_dptr, - const int64_t nbatch, const int64_t channels, - const int64_t in_height, const int64_t in_width, - const int64_t out_height, const int64_t out_width, - const float scale_height, const float scale_width, - bool align_corners, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { - const int output_x = idx % out_width; - const int output_y = idx / out_width; - - T* in = dx_dptr; - const T* out = dy_dptr; - - T real_x = GetAreaPixel(scale_width, output_x, align_corners, true); - int64_t input_x = floor(1.0 * real_x); - T t_x = real_x - input_x; - - T real_y = GetAreaPixel(scale_height, output_y, align_corners, true); - int64_t input_y = floor(1.0 * real_y); - T t_y = real_y - input_y; - - T x_coeffs[4]; - T y_coeffs[4]; - - get_cubic_upsample_coefficients(x_coeffs, t_x); - get_cubic_upsample_coefficients(y_coeffs, t_y); - - for (int64_t c = 0; c < channels * nbatch; c++) { - T out_value = out[output_y * out_width + output_x]; - - for (int64_t i = 0; i < 4; i++) { - for (int64_t j = 0; j < 4; j++) { - upsample_increment_value_bounded_cuda(in, in_width, in_height, input_x - 1 + i, - input_y - 1 + j, - out_value * 
y_coeffs[j] * x_coeffs[i]); - } - } - - in += in_width * in_height; - out += out_width * out_height; - } - } -} - -} // namespace - -template -class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel { - public: - UpsampleBicubic2dGPUKernel() = default; - ~UpsampleBicubic2dGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const T* in_ptr = x_tensor->dptr(); - T* out_ptr = y_tensor->mut_dptr(); - const bool align_corners = ctx->Attr("align_corners"); - - const int nbatch = x_tensor->shape_view().At(0); - const int channels = x_tensor->shape_view().At(1); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t in_width = x_tensor->shape_view().At(3); - const int64_t out_height = y_tensor->shape_view().At(2); - const int64_t out_width = y_tensor->shape_view().At(3); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - const int64_t elem_cnt = out_height * out_width; - - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleBicubic2dForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), nbatch, channels, in_height, in_width, out_height, - out_width, scale_height, scale_width, align_corners, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel { - public: - UpsampleBicubic2dGradGPUKernel() = default; - ~UpsampleBicubic2dGradGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - - const int nbatch = dx_tensor->shape_view().At(0); - const int channels = dx_tensor->shape_view().At(1); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t in_width = dx_tensor->shape_view().At(3); - const int64_t out_height = dy_tensor->shape_view().At(2); - const int64_t out_width = dy_tensor->shape_view().At(3); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - const int64_t elem_cnt = out_height * out_width; - - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), 
dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleBicubic2dBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), nbatch, channels, in_height, in_width, out_height, - out_width, scale_height, scale_width, align_corners, - dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_bicubic_2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_bicubic_2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(float) -REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
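// The UpsampleBicubic2d kernels in this file (above, and re-added below) delegate the
// 1-D weighting to cubic_interp1d / get_cubic_upsample_coefficients from
// upsample_kernel.h, which is not part of this hunk. A sketch under the assumption
// that it follows the common Keys cubic-convolution kernel with A = -0.75; the helper
// names here are illustrative:
#include <cstdio>

// Weight for |x| <= 1 and for 1 < |x| <= 2, respectively (Keys, A = -0.75).
static double CubicConv1(double x, double A) { return ((A + 2.0) * x - (A + 3.0)) * x * x + 1.0; }
static double CubicConv2(double x, double A) { return ((A * x - 5.0 * A) * x + 8.0 * A) * x - 4.0 * A; }

static void CubicCoefficients(double t, double coeffs[4]) {
  const double A = -0.75;
  coeffs[0] = CubicConv2(t + 1.0, A);  // sample at offset -1
  coeffs[1] = CubicConv1(t, A);        // sample at offset 0
  coeffs[2] = CubicConv1(1.0 - t, A);  // sample at offset +1
  coeffs[3] = CubicConv2(2.0 - t, A);  // sample at offset +2
}

// Interpolates between four consecutive samples x0..x3 at fractional position t in [0, 1).
static double CubicInterp1d(double x0, double x1, double x2, double x3, double t) {
  double c[4];
  CubicCoefficients(t, c);
  return x0 * c[0] + x1 * c[1] + x2 * c[2] + x3 * c[3];
}

int main() {
  // At t = 0 the interpolation reproduces the second sample exactly.
  std::printf("%f\n", CubicInterp1d(1.0, 2.0, 3.0, 4.0, 0.0));  // prints 2.000000
  return 0;
}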
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__device__ void upsample_increment_value_bounded_cuda(T* data, int64_t width, int64_t height, + int64_t x, int64_t y, T value) { + int64_t access_x = max(min(x, width - 1), static_cast(0)); + int64_t access_y = max(min(y, height - 1), static_cast(0)); + cuda::atomic::Add(data + access_y * width + access_x, value); +} + +template +__global__ void UpsampleBicubic2dForward(const int64_t elem_cnt, const T* in_dptr, + const int64_t nbatch, const int64_t channels, + const int64_t in_height, const int64_t in_width, + const int64_t out_height, const int64_t out_width, + const float scale_height, const float scale_width, + bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { + const int output_x = idx % out_width; + const int output_y = idx / out_width; + + const T* in = in_dptr; + T* out = out_dptr; + + const T real_x = GetAreaPixel(scale_width, output_x, align_corners, /*cubic=*/true); + int64_t input_x = floor(1.0 * real_x); + const T t_x = real_x - input_x; + + const T real_y = GetAreaPixel(scale_height, output_y, align_corners, /*cubic=*/true); + int64_t input_y = floor(1.0 * real_y); + const T t_y = real_y - input_y; + + for (int64_t c = 0; c < channels * nbatch; c++) { + T coefficients[4]; + + // Interpolate 4 times in the x direction + for (int64_t i = 0; i < 4; i++) { + coefficients[i] = cubic_interp1d( + upsample_get_value_bounded(in, in_width, in_height, input_x - 1, input_y - 1 + i), + upsample_get_value_bounded(in, in_width, in_height, input_x + 0, input_y - 1 + i), + upsample_get_value_bounded(in, in_width, in_height, input_x + 1, input_y - 1 + i), + upsample_get_value_bounded(in, in_width, in_height, input_x + 2, input_y - 1 + i), + t_x); + } + + // Interpolate in the y direction using x interpolations + out[output_y * out_width + output_x] = cubic_interp1d( + coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y); + + // Move to next channel + in += in_width * in_height; + out += out_width * out_height; + } + } +} + +template +__global__ void UpsampleBicubic2dBackward(const int64_t elem_cnt, const T* dy_dptr, + const int64_t nbatch, const int64_t channels, + const int64_t in_height, const int64_t in_width, + const int64_t out_height, const int64_t out_width, + const float scale_height, const float scale_width, + bool align_corners, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { + const int output_x = idx % out_width; + const int output_y = idx / out_width; + + T* in = dx_dptr; + const T* out = dy_dptr; + + T real_x = GetAreaPixel(scale_width, output_x, align_corners, true); + int64_t input_x = floor(1.0 * real_x); + T t_x = real_x - input_x; + + T real_y = GetAreaPixel(scale_height, output_y, align_corners, true); + int64_t input_y = floor(1.0 * real_y); + T t_y = real_y - input_y; + + T x_coeffs[4]; + T y_coeffs[4]; + + get_cubic_upsample_coefficients(x_coeffs, t_x); + get_cubic_upsample_coefficients(y_coeffs, t_y); + + for (int64_t c = 0; c < channels * nbatch; c++) { + T out_value = out[output_y * out_width + output_x]; + + for (int64_t i = 0; i < 4; i++) { + for (int64_t j = 0; j < 4; j++) { + upsample_increment_value_bounded_cuda(in, in_width, in_height, input_x - 1 + i, + input_y - 1 + j, + out_value * 
y_coeffs[j] * x_coeffs[i]); + } + } + + in += in_width * in_height; + out += out_width * out_height; + } + } +} + +} // namespace + +template +class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel { + public: + UpsampleBicubic2dGPUKernel() = default; + ~UpsampleBicubic2dGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const T* in_ptr = x_tensor->dptr(); + T* out_ptr = y_tensor->mut_dptr(); + const bool align_corners = ctx->Attr("align_corners"); + + const int nbatch = x_tensor->shape_view().At(0); + const int channels = x_tensor->shape_view().At(1); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + const int64_t elem_cnt = out_height * out_width; + + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + + RUN_CUDA_KERNEL((UpsampleBicubic2dForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), nbatch, channels, in_height, in_width, out_height, + out_width, scale_height, scale_width, align_corners, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel { + public: + UpsampleBicubic2dGradGPUKernel() = default; + ~UpsampleBicubic2dGradGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + + const int nbatch = dx_tensor->shape_view().At(0); + const int channels = dx_tensor->shape_view().At(1); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + const int64_t elem_cnt = out_height * out_width; + + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), 
+template<typename T>
+class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel {
+ public:
+  UpsampleBicubic2dGPUKernel() = default;
+  ~UpsampleBicubic2dGPUKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
+    user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
+    const T* in_ptr = x_tensor->dptr<T>();
+    T* out_ptr = y_tensor->mut_dptr<T>();
+    const bool align_corners = ctx->Attr<bool>("align_corners");
+
+    const int nbatch = x_tensor->shape_view().At(0);
+    const int channels = x_tensor->shape_view().At(1);
+    const int64_t in_height = x_tensor->shape_view().At(2);
+    const int64_t in_width = x_tensor->shape_view().At(3);
+    const int64_t out_height = y_tensor->shape_view().At(2);
+    const int64_t out_width = y_tensor->shape_view().At(3);
+    const std::vector<int64_t> output_size = ctx->Attr<std::vector<int64_t>>("output_size");
+    double height_scale = ctx->Attr<double>("height_scale");
+    double width_scale = ctx->Attr<double>("width_scale");
+    if (!output_size.empty()) {
+      height_scale = static_cast<double>(out_height) / static_cast<double>(in_height);
+      width_scale = static_cast<double>(out_width) / static_cast<double>(in_width);
+    }
+    const int64_t elem_cnt = out_height * out_width;
+
+    if (in_height == out_height && in_width == out_width) {
+      Memcpy<DeviceType::kCUDA>(
+          ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(),
+          x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type()));
+    } else {
+      const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
+      const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
+
+      RUN_CUDA_KERNEL((UpsampleBicubic2dForward<T>), ctx->stream(), elem_cnt, elem_cnt,
+                      x_tensor->dptr<T>(), nbatch, channels, in_height, in_width, out_height,
+                      out_width, scale_height, scale_width, align_corners,
+                      y_tensor->mut_dptr<T>());
+    }
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+template<typename T>
+class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel {
+ public:
+  UpsampleBicubic2dGradGPUKernel() = default;
+  ~UpsampleBicubic2dGradGPUKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
+    Memset<DeviceType::kCUDA>(ctx->stream(), dx_tensor->mut_dptr(), 0,
+                              dx_tensor->shape_view().elem_cnt() * sizeof(T));
+    const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
+    const bool align_corners = ctx->Attr<bool>("align_corners");
+
+    const int nbatch = dx_tensor->shape_view().At(0);
+    const int channels = dx_tensor->shape_view().At(1);
+    const int64_t in_height = dx_tensor->shape_view().At(2);
+    const int64_t in_width = dx_tensor->shape_view().At(3);
+    const int64_t out_height = dy_tensor->shape_view().At(2);
+    const int64_t out_width = dy_tensor->shape_view().At(3);
+    const std::vector<int64_t> output_size = ctx->Attr<std::vector<int64_t>>("output_size");
+    double height_scale = ctx->Attr<double>("height_scale");
+    double width_scale = ctx->Attr<double>("width_scale");
+    if (!output_size.empty()) {
+      height_scale = static_cast<double>(out_height) / static_cast<double>(in_height);
+      width_scale = static_cast<double>(out_width) / static_cast<double>(in_width);
+    }
+    const int64_t elem_cnt = out_height * out_width;
+
+    if (in_height == out_height && in_width == out_width) {
+      Memcpy<DeviceType::kCUDA>(
+          ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(),
+          dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type()));
+    } else {
+      const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
+      const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
+
+      RUN_CUDA_KERNEL((UpsampleBicubic2dBackward<T>), ctx->stream(), elem_cnt, elem_cnt,
+                      dy_tensor->dptr<T>(), nbatch, channels, in_height, in_width, out_height,
+                      out_width, scale_height, scale_width, align_corners,
+                      dx_tensor->mut_dptr<T>());
+    }
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(dtype)                                      \
+  REGISTER_USER_KERNEL("upsample_bicubic_2d")                                             \
+      .SetCreateFn<UpsampleBicubic2dGPUKernel<dtype>>()                                   \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                    \
+                       && (user_op::HobDataType("y", 0) == GetDataType<dtype>::value));   \
+  REGISTER_USER_KERNEL("upsample_bicubic_2d_grad")                                        \
+      .SetCreateFn<UpsampleBicubic2dGradGPUKernel<dtype>>()                               \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                    \
+                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
+
+REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(float)
+REGISTER_UPSAMPLE_BICUBIC_CUDA_KERNEL(double)
+
 } // namespace oneflow
\ No newline at end of file
diff --git a/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp b/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp
index 1a4eb29..b1756a9 100644
--- a/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp
+++ b/oneflow/user/kernels/upsample_bilinear_2d_kernel.hip.cpp
@@ -1,190 +1,190 @@
-/*
-Copyright 2020 The OneFlow Authors. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleBilinear2DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_height, const int64_t in_width, - const T scale_h, const T scale_w, - const bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - out_helper.OffsetToNdIndex(index, n, c, h, w); - BilinearParam params; - GetBilinearParam(align_corners, h, w, in_height, in_width, scale_h, scale_w, ¶ms); - const int64_t top_offset = in_helper.NdIndexToOffset(n, c, params.top_h_index, 0); - const int64_t bottom_offset = in_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); - const T top_left = in_dptr[top_offset + params.left_w_index]; - const T top_right = in_dptr[top_offset + params.right_w_index]; - const T bottom_left = in_dptr[bottom_offset + params.left_w_index]; - const T bottom_right = in_dptr[bottom_offset + params.right_w_index]; - out_dptr[index] = - (1 - params.h_lerp) * ((1 - params.w_lerp) * top_left + params.w_lerp * top_right) - + params.h_lerp * ((1 - params.w_lerp) * bottom_left + params.w_lerp * bottom_right); - } -} - -template -__global__ void UpsampleBilinearBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t dx_height, const int64_t dx_width, - const T scale_h, const T scale_w, const bool align_corners, - T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - dy_helper.OffsetToNdIndex(index, n, c, h, w); - BilinearParam params; - GetBilinearParam(align_corners, h, w, dx_height, dx_width, scale_h, scale_w, ¶ms); - const int64_t top_offset = dx_helper.NdIndexToOffset(n, c, params.top_h_index, 0); - const int64_t bottom_offset = dx_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); - const T dy = dy_dptr[index]; - const T dbottom = params.h_lerp * dy; - T* dx_dptr_bottom_offset = dx_dptr + bottom_offset; - cuda::atomic::Add(dx_dptr_bottom_offset + params.left_w_index, - static_cast((1 - params.w_lerp) * dbottom)); - cuda::atomic::Add(dx_dptr_bottom_offset + params.right_w_index, - static_cast(params.w_lerp * dbottom)); - const T dtop = dy - dbottom; - T* dx_dptr_top_offset = dx_dptr + top_offset; - cuda::atomic::Add(dx_dptr_top_offset + params.left_w_index, - static_cast((1 - params.w_lerp) * dtop)); - cuda::atomic::Add(dx_dptr_top_offset + params.right_w_index, - static_cast(params.w_lerp * dtop)); - } -} - -} // namespace - -template -class UpsampleBilinear2DGPUKernel final : public user_op::OpKernel { - public: - UpsampleBilinear2DGPUKernel() = default; - ~UpsampleBilinear2DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const bool align_corners = ctx->Attr("align_corners"); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - 
NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3)); - - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t in_width = x_tensor->shape_view().At(3); - const int64_t out_height = y_tensor->shape_view().At(2); - const int64_t out_width = y_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - RUN_CUDA_KERNEL((UpsampleBilinear2DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, in_height, in_width, scale_height, - scale_width, align_corners, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleBilinear2DGradGPUKernel final : public user_op::OpKernel { - public: - UpsampleBilinear2DGradGPUKernel() = default; - ~UpsampleBilinear2DGradGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), - dy_tensor->shape_view().At(3)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3)); - - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t in_width = dx_tensor->shape_view().At(3); - const int64_t out_height = dy_tensor->shape_view().At(2); - const int64_t out_width = dy_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - RUN_CUDA_KERNEL((UpsampleBilinearBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, in_height, in_width, scale_height, - 
scale_width, align_corners, dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_bilinear_2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_bilinear_2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(float) -REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleBilinear2DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_height, const int64_t in_width, + const T scale_h, const T scale_w, + const bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + out_helper.OffsetToNdIndex(index, n, c, h, w); + BilinearParam params; + GetBilinearParam(align_corners, h, w, in_height, in_width, scale_h, scale_w, ¶ms); + const int64_t top_offset = in_helper.NdIndexToOffset(n, c, params.top_h_index, 0); + const int64_t bottom_offset = in_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); + const T top_left = in_dptr[top_offset + params.left_w_index]; + const T top_right = in_dptr[top_offset + params.right_w_index]; + const T bottom_left = in_dptr[bottom_offset + params.left_w_index]; + const T bottom_right = in_dptr[bottom_offset + params.right_w_index]; + out_dptr[index] = + (1 - params.h_lerp) * ((1 - params.w_lerp) * top_left + params.w_lerp * top_right) + + params.h_lerp * ((1 - params.w_lerp) * bottom_left + params.w_lerp * bottom_right); + } +} + +template +__global__ void UpsampleBilinearBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t dx_height, const int64_t dx_width, + const T scale_h, const T scale_w, const bool align_corners, + T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + dy_helper.OffsetToNdIndex(index, n, c, h, w); + BilinearParam params; + GetBilinearParam(align_corners, h, w, dx_height, dx_width, scale_h, scale_w, ¶ms); + const int64_t top_offset = dx_helper.NdIndexToOffset(n, c, params.top_h_index, 0); + const int64_t bottom_offset = dx_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0); + const T dy = dy_dptr[index]; + const T dbottom = params.h_lerp * dy; + T* 
dx_dptr_bottom_offset = dx_dptr + bottom_offset; + cuda::atomic::Add(dx_dptr_bottom_offset + params.left_w_index, + static_cast((1 - params.w_lerp) * dbottom)); + cuda::atomic::Add(dx_dptr_bottom_offset + params.right_w_index, + static_cast(params.w_lerp * dbottom)); + const T dtop = dy - dbottom; + T* dx_dptr_top_offset = dx_dptr + top_offset; + cuda::atomic::Add(dx_dptr_top_offset + params.left_w_index, + static_cast((1 - params.w_lerp) * dtop)); + cuda::atomic::Add(dx_dptr_top_offset + params.right_w_index, + static_cast(params.w_lerp * dtop)); + } +} + +} // namespace + +template +class UpsampleBilinear2DGPUKernel final : public user_op::OpKernel { + public: + UpsampleBilinear2DGPUKernel() = default; + ~UpsampleBilinear2DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const bool align_corners = ctx->Attr("align_corners"); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); + + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + RUN_CUDA_KERNEL((UpsampleBilinear2DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, in_height, in_width, scale_height, + scale_width, align_corners, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleBilinear2DGradGPUKernel final : public user_op::OpKernel { + public: + UpsampleBilinear2DGradGPUKernel() = default; + ~UpsampleBilinear2DGradGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + 
NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3)); + + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + RUN_CUDA_KERNEL((UpsampleBilinearBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, in_height, in_width, scale_height, + scale_width, align_corners, dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_bilinear_2d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_bilinear_2d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(float) +REGISTER_UPSAMPLE_BILINEAR_2D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp b/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp index a949f1e..9850fa2 100644 --- a/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_linear_1d_kernel.hip.cpp @@ -1,163 +1,163 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleLinear1DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int in_height, const double scale_factor, - bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - out_helper.OffsetToNdIndex(index, n, c, h); - const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const double h1lambda = h1r - h1; - const double h0lambda = static_cast(1.) - h1lambda; - out_dptr[index] = h0lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1)] - + h1lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1 + h1p)]; - } -} - -template -__global__ void UpsampleLinear1DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int in_height, const double scale_factor, - bool align_corners, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - dy_helper.OffsetToNdIndex(index, n, c, h); - const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const double h1lambda = h1r - h1; - const double h0lambda = static_cast(1.) - h1lambda; - - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1), h0lambda * dy_dptr[index]); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1 + h1p), - h1lambda * dy_dptr[index]); - } -} - -} // namespace - -template -class UpsampleLinear1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleLinear1DGPUKernel() = default; - ~UpsampleLinear1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(2); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - RUN_CUDA_KERNEL((UpsampleLinear1DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, in_height, scale_height, - align_corners, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const 
override { return false; } -}; - -template -class UpsampleLinearGrad1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleLinearGrad1DGPUKernel() = default; - ~UpsampleLinearGrad1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - - NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), - dy_tensor->shape_view().At(1), - dy_tensor->shape_view().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), - dx_tensor->shape_view().At(1), - dx_tensor->shape_view().At(2)); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(2); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - RUN_CUDA_KERNEL((UpsampleLinear1DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, in_height, scale_height, - align_corners, dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_linear_1d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_linear_1d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(float) -REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleLinear1DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int in_height, const double scale_factor, + bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + out_helper.OffsetToNdIndex(index, n, c, h); + const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const double h1lambda = h1r - h1; + const double h0lambda = static_cast(1.) - h1lambda; + out_dptr[index] = h0lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1)] + + h1lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1 + h1p)]; + } +} + +template +__global__ void UpsampleLinear1DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int in_height, const double scale_factor, + bool align_corners, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + dy_helper.OffsetToNdIndex(index, n, c, h); + const double h1r = GetLinearInputIndex(h, scale_factor, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const double h1lambda = h1r - h1; + const double h0lambda = static_cast(1.) - h1lambda; + + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1), h0lambda * dy_dptr[index]); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1 + h1p), + h1lambda * dy_dptr[index]); + } +} + +} // namespace + +template +class UpsampleLinear1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleLinear1DGPUKernel() = default; + ~UpsampleLinear1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const bool align_corners = ctx->Attr("align_corners"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + RUN_CUDA_KERNEL((UpsampleLinear1DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, in_height, scale_height, + align_corners, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const 
override { return false; } +}; + +template +class UpsampleLinearGrad1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleLinearGrad1DGPUKernel() = default; + ~UpsampleLinearGrad1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + RUN_CUDA_KERNEL((UpsampleLinear1DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, in_height, scale_height, + align_corners, dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_linear_1d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_linear_1d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(float) +REGISTER_UPSAMPLELINEAR1D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp b/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp index c007355..4de05b7 100644 --- a/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_nearest_kernel.hip.cpp @@ -1,412 +1,412 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleNearest1DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_height, const double scale_factor, - T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - out_helper.OffsetToNdIndex(index, n, c, h); - const int64_t in_h = GetNearestInputIndex(h, scale_factor, in_height); - out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h)]; - } -} - -template -__global__ void UpsampleNearest1DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t in_height, const double scale_factor, - T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h; - dy_helper.OffsetToNdIndex(index, n, c, h); - const int64_t dx_h = GetNearestInputIndex(h, scale_factor, in_height); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h), dy_dptr[index]); - } -} - -template -__global__ void UpsampleNearest2DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_height, const int64_t in_width, - const double scale_h, const double scale_w, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - out_helper.OffsetToNdIndex(index, n, c, h, w); - const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); - const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); - out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h, in_w)]; - } -} - -template -__global__ void UpsampleNearest2DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t dx_height, const int64_t dx_width, - const double scale_h, const double scale_w, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, h, w; - dy_helper.OffsetToNdIndex(index, n, c, h, w); - const int64_t dx_h = GetNearestInputIndex(h, scale_h, dx_height); - const int64_t dx_w = GetNearestInputIndex(w, scale_w, dx_width); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h, dx_w), dy_dptr[index]); - } -} - -template -__global__ void UpsampleNearest3DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const float scale_d, - const float scale_h, const float scale_w, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - out_helper.OffsetToNdIndex(index, n, c, d, h, w); - const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); - const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); - const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); - out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_d, in_h, in_w)]; - } -} - -template -__global__ void UpsampleNearest3DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const float scale_d, - const float scale_h, const 
float scale_w, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - dy_helper.OffsetToNdIndex(index, n, c, d, h, w); - const int64_t dx_h = GetNearestInputIndex(h, scale_h, in_height); - const int64_t dx_w = GetNearestInputIndex(w, scale_w, in_width); - const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); - cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, in_d, dx_h, dx_w), dy_dptr[index]); - } -} - -} // namespace - -template -class UpsampleNearest1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest1DGPUKernel() = default; - ~UpsampleNearest1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(2); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); - RUN_CUDA_KERNEL((UpsampleNearest1DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - 1.f / height_scale, y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleNearestGrad1DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearestGrad1DGPUKernel() = default; - ~UpsampleNearestGrad1DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("scale_factor"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(2); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - } - if (in_height == out_height) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), - dy_tensor->shape_view().At(1), - dy_tensor->shape_view().At(2)); - NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), - dx_tensor->shape_view().At(1), - dx_tensor->shape_view().At(2)); - RUN_CUDA_KERNEL((UpsampleNearest1DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, 
dx_tensor->shape_view().At(2), - 1.f / height_scale, dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_nearest_1d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_nearest_1d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(float) -REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(double) - -template -class UpsampleNearest2DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest2DGPUKernel() = default; - ~UpsampleNearest2DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - const int64_t in_height = x_tensor->shape_view().At(2); - const int64_t in_width = x_tensor->shape_view().At(3); - const int64_t out_height = y_tensor->shape_view().At(2); - const int64_t out_width = y_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), - x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); - } else { - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3)); - RUN_CUDA_KERNEL((UpsampleNearest2DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, - y_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleNearest2DGradGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest2DGradGPUKernel() = default; - ~UpsampleNearest2DGradGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - const int64_t in_height = dx_tensor->shape_view().At(2); - const int64_t in_width = dx_tensor->shape_view().At(3); - const int64_t 
out_height = dy_tensor->shape_view().At(2); - const int64_t out_width = dy_tensor->shape_view().At(3); - if (!output_size.empty()) { - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - if (in_height == out_height && in_width == out_width) { - Memcpy( - ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), - dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); - } else { - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), - dy_tensor->shape_view().At(2), dy_tensor->shape_view().At(3)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), - dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3)); - RUN_CUDA_KERNEL((UpsampleNearest2DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, - dx_tensor->mut_dptr()); - } - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_nearest_2d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_nearest_2d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(float) -REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(double) - -template -class UpsampleNearest3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearest3DGPUKernel() = default; - ~UpsampleNearest3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = x_tensor->shape_view().At(2); - const int64_t in_height = x_tensor->shape_view().At(3); - const int64_t in_width = x_tensor->shape_view().At(4); - const int64_t out_depth = y_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(3); - const int64_t out_width = y_tensor->shape_view().At(4); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); - RUN_CUDA_KERNEL((UpsampleNearest3DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - 
x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), 1.f / depth_scale, - 1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleNearestGrad3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleNearestGrad3DGPUKernel() = default; - ~UpsampleNearestGrad3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - const int64_t in_depth = dx_tensor->shape_view().At(2); - const int64_t in_height = dx_tensor->shape_view().At(3); - const int64_t in_width = dx_tensor->shape_view().At(4); - const int64_t out_depth = dy_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(3); - const int64_t out_width = dy_tensor->shape_view().At(4); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), - dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); - RUN_CUDA_KERNEL((UpsampleNearest3DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), 1.f / depth_scale, - 1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_nearest_3d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_nearest_3d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(float) -REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleNearest1DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_height, const double scale_factor, + T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + out_helper.OffsetToNdIndex(index, n, c, h); + const int64_t in_h = GetNearestInputIndex(h, scale_factor, in_height); + out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h)]; + } +} + +template +__global__ void UpsampleNearest1DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t in_height, const double scale_factor, + T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h; + dy_helper.OffsetToNdIndex(index, n, c, h); + const int64_t dx_h = GetNearestInputIndex(h, scale_factor, in_height); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h), dy_dptr[index]); + } +} + +template +__global__ void UpsampleNearest2DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_height, const int64_t in_width, + const double scale_h, const double scale_w, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + out_helper.OffsetToNdIndex(index, n, c, h, w); + const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); + const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); + out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h, in_w)]; + } +} + +template +__global__ void UpsampleNearest2DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t dx_height, const int64_t dx_width, + const double scale_h, const double scale_w, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, h, w; + dy_helper.OffsetToNdIndex(index, n, c, h, w); + const int64_t dx_h = GetNearestInputIndex(h, scale_h, dx_height); + const int64_t dx_w = GetNearestInputIndex(w, scale_w, dx_width); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h, dx_w), dy_dptr[index]); + } +} + +template +__global__ void UpsampleNearest3DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_depth, const int64_t in_height, + const int64_t in_width, const float scale_d, + const float scale_h, const float scale_w, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + out_helper.OffsetToNdIndex(index, n, c, d, h, w); + const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height); + const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width); + const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); + out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_d, in_h, in_w)]; + } +} + +template +__global__ void UpsampleNearest3DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t in_depth, 
const int64_t in_height, + const int64_t in_width, const float scale_d, + const float scale_h, const float scale_w, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + dy_helper.OffsetToNdIndex(index, n, c, d, h, w); + const int64_t dx_h = GetNearestInputIndex(h, scale_h, in_height); + const int64_t dx_w = GetNearestInputIndex(w, scale_w, in_width); + const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth); + cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, in_d, dx_h, dx_w), dy_dptr[index]); + } +} + +} // namespace + +template +class UpsampleNearest1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest1DGPUKernel() = default; + ~UpsampleNearest1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(2); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2)); + RUN_CUDA_KERNEL((UpsampleNearest1DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + 1.f / height_scale, y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleNearestGrad1DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearestGrad1DGPUKernel() = default; + ~UpsampleNearestGrad1DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("scale_factor"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(2); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + } + if (in_height == out_height) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + NdIndexOffsetHelper dy_helper(dy_tensor->shape_view().At(0), + dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2)); + NdIndexOffsetHelper dx_helper(dx_tensor->shape_view().At(0), + dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2)); + 
RUN_CUDA_KERNEL((UpsampleNearest1DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + 1.f / height_scale, dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_nearest_1d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_nearest_1d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(float) +REGISTER_UPSAMPNEAREST1D_CUDA_KERNEL(double) + +template +class UpsampleNearest2DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest2DGPUKernel() = default; + ~UpsampleNearest2DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + const int64_t in_height = x_tensor->shape_view().At(2); + const int64_t in_width = x_tensor->shape_view().At(3); + const int64_t out_height = y_tensor->shape_view().At(2); + const int64_t out_width = y_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), y_tensor->mut_dptr(), x_tensor->dptr(), + x_tensor->shape_view().elem_cnt() * GetSizeOfDataType(x_tensor->data_type())); + } else { + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3)); + RUN_CUDA_KERNEL((UpsampleNearest2DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, + y_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleNearest2DGradGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest2DGradGPUKernel() = default; + ~UpsampleNearest2DGradGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + const 
int64_t in_height = dx_tensor->shape_view().At(2); + const int64_t in_width = dx_tensor->shape_view().At(3); + const int64_t out_height = dy_tensor->shape_view().At(2); + const int64_t out_width = dy_tensor->shape_view().At(3); + if (!output_size.empty()) { + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + if (in_height == out_height && in_width == out_width) { + Memcpy( + ctx->stream(), dx_tensor->mut_dptr(), dy_tensor->dptr(), + dy_tensor->shape_view().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type())); + } else { + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), + dy_tensor->shape_view().At(2), dy_tensor->shape_view().At(3)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), + dx_tensor->shape_view().At(2), dx_tensor->shape_view().At(3)); + RUN_CUDA_KERNEL((UpsampleNearest2DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), 1.f / height_scale, 1.f / width_scale, + dx_tensor->mut_dptr()); + } + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_nearest_2d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_nearest_2d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(float) +REGISTER_UPSAMPLE_NEAREST_2D_CUDA_KERNEL(double) + +template +class UpsampleNearest3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearest3DGPUKernel() = default; + ~UpsampleNearest3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t in_depth = x_tensor->shape_view().At(2); + const int64_t in_height = x_tensor->shape_view().At(3); + const int64_t in_width = x_tensor->shape_view().At(4); + const int64_t out_depth = y_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(3); + const int64_t out_width = y_tensor->shape_view().At(4); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); + RUN_CUDA_KERNEL((UpsampleNearest3DForward), 
ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), 1.f / depth_scale, + 1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleNearestGrad3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleNearestGrad3DGPUKernel() = default; + ~UpsampleNearestGrad3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + const int64_t in_depth = dx_tensor->shape_view().At(2); + const int64_t in_height = dx_tensor->shape_view().At(3); + const int64_t in_width = dx_tensor->shape_view().At(4); + const int64_t out_depth = dy_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(3); + const int64_t out_width = dy_tensor->shape_view().At(4); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); + RUN_CUDA_KERNEL((UpsampleNearest3DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), 1.f / depth_scale, + 1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_nearest_3d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_nearest_3d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(float) +REGISTER_UPSAMPNEAREST3D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp b/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp index 489312f..030c651 100644 --- a/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp +++ b/oneflow/user/kernels/upsample_trilinear_3d_kernel.hip.cpp @@ -1,237 +1,237 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include "oneflow/core/framework/framework.h" -#include "oneflow/core/kernel/new_kernel_util.h" -#include "oneflow/core/common/nd_index_offset_helper.h" -#include "oneflow/core/hip/atomic.hip.h" -#include "oneflow/user/kernels/upsample_kernel.h" - -namespace oneflow { - -namespace { - -template -__global__ void UpsampleTrilinear3DForward(const int64_t elem_cnt, const T* in_dptr, - NdIndexOffsetHelper in_helper, - NdIndexOffsetHelper out_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const T rdepth, const T rheight, - const T rwidth, const bool align_corners, T* out_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - out_helper.OffsetToNdIndex(index, n, c, d, h, w); - - const T t1r = GetAreaPixel(rdepth, d, align_corners); - const int64_t t1 = t1r; - const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; - const T t1lambda = t1r - t1; - const T t0lambda = static_cast(1.) - t1lambda; - - const T h1r = GetAreaPixel(rheight, h, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const T h1lambda = h1r - h1; - const T h0lambda = static_cast(1.) - h1lambda; - - const T w1r = GetAreaPixel(rwidth, w, align_corners); - const int64_t w1 = w1r; - const int64_t w1p = (w1 < in_width - 1) ? 1 : 0; - const T w1lambda = w1r - w1; - const T w0lambda = static_cast(1.) - w1lambda; - - const T* pos1 = &in_dptr[in_helper.NdIndexToOffset(n, c, t1, h1, w1)]; - - out_dptr[index] = - t0lambda - * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) - + h1lambda - * (w0lambda * pos1[h1p * in_width] + w1lambda * pos1[h1p * in_width + w1p])) - + t1lambda - * (h0lambda - * (w0lambda * pos1[t1p * in_height * in_width] - + w1lambda * pos1[t1p * in_height * in_width + w1p]) - + h1lambda - * (w0lambda * pos1[t1p * in_height * in_width + h1p * in_width] - + w1lambda * pos1[t1p * in_height * in_width + h1p * in_width + w1p])); - } -} - -template -__global__ void UpsampleTrilinear3DBackward(const int64_t elem_cnt, const T* dy_dptr, - NdIndexOffsetHelper dy_helper, - NdIndexOffsetHelper dx_helper, - const int64_t in_depth, const int64_t in_height, - const int64_t in_width, const T rdepth, const T rheight, - const T rwidth, const bool align_corners, T* dx_dptr) { - CUDA_1D_KERNEL_LOOP(index, elem_cnt) { - int64_t n, c, d, h, w; - dy_helper.OffsetToNdIndex(index, n, c, d, h, w); - - const T t1r = GetAreaPixel(rdepth, d, align_corners); - const int64_t t1 = t1r; - const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; - const T t1lambda = t1r - t1; - const T t0lambda = static_cast(1.) - t1lambda; - - const T h1r = GetAreaPixel(rheight, h, align_corners); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; - const T h1lambda = h1r - h1; - const T h0lambda = static_cast(1.) - h1lambda; - - const T w1r = GetAreaPixel(rwidth, w, align_corners); - const int64_t w1 = w1r; - const int64_t w1p = (w1 < in_width - 1) ? 
1 : 0; - const T w1lambda = w1r - w1; - const T w0lambda = static_cast(1.) - w1lambda; - - T* pos1 = &dx_dptr[dx_helper.NdIndexToOffset(n, c, t1, h1, w1)]; - const T* pos2 = &dy_dptr[index]; - - cuda::atomic::Add(pos1 + 0, t0lambda * h0lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + w1p, t0lambda * h0lambda * w1lambda * pos2[0]); - cuda::atomic::Add(pos1 + h1p * in_width, t0lambda * h1lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + h1p * in_width + w1p, t0lambda * h1lambda * w1lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width, t1lambda * h0lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width + w1p, - t1lambda * h0lambda * w1lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width, - t1lambda * h1lambda * w0lambda * pos2[0]); - cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width + w1p, - t1lambda * h1lambda * w1lambda * pos2[0]); - } -} - -} // namespace - -template -class UpsampleTrilinear3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleTrilinear3DGPUKernel() = default; - ~UpsampleTrilinear3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); - user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); - const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper in_helper( - x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); - NdIndexOffsetHelper out_helper( - y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), - y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); - - const int64_t in_depth = x_tensor->shape_view().At(2); - const int64_t in_height = x_tensor->shape_view().At(3); - const int64_t in_width = x_tensor->shape_view().At(4); - - const int64_t out_depth = y_tensor->shape_view().At(2); - const int64_t out_height = y_tensor->shape_view().At(3); - const int64_t out_width = y_tensor->shape_view().At(4); - - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - - const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleTrilinear3DForward), ctx->stream(), elem_cnt, elem_cnt, - x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), - x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), scale_depth, - scale_height, scale_width, align_corners, y_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -template -class UpsampleTrilinearGrad3DGPUKernel final : public user_op::OpKernel { - public: - UpsampleTrilinearGrad3DGPUKernel() = default; - 
~UpsampleTrilinearGrad3DGPUKernel() = default; - - private: - using user_op::OpKernel::Compute; - void Compute(user_op::KernelComputeContext* ctx) const override { - user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); - - Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, - dx_tensor->shape_view().elem_cnt() * sizeof(T)); - const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); - const bool align_corners = ctx->Attr("align_corners"); - const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); - NdIndexOffsetHelper dy_helper( - dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), - dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); - NdIndexOffsetHelper dx_helper( - dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); - - const int64_t in_depth = dx_tensor->shape_view().At(2); - const int64_t in_height = dx_tensor->shape_view().At(3); - const int64_t in_width = dx_tensor->shape_view().At(4); - - const int64_t out_depth = dy_tensor->shape_view().At(2); - const int64_t out_height = dy_tensor->shape_view().At(3); - const int64_t out_width = dy_tensor->shape_view().At(4); - - const std::vector output_size = ctx->Attr>("output_size"); - double depth_scale = ctx->Attr("depth_scale"); - double height_scale = ctx->Attr("height_scale"); - double width_scale = ctx->Attr("width_scale"); - if (!output_size.empty()) { - depth_scale = static_cast(out_depth) / static_cast(in_depth); - height_scale = static_cast(out_height) / static_cast(in_height); - width_scale = static_cast(out_width) / static_cast(in_width); - } - - const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); - const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); - const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); - - RUN_CUDA_KERNEL((UpsampleTrilinear3DBackward), ctx->stream(), elem_cnt, elem_cnt, - dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), - dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), scale_depth, - scale_height, scale_width, align_corners, dx_tensor->mut_dptr()); - } - bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } -}; - -#define REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(dtype) \ - REGISTER_USER_KERNEL("upsample_trilinear_3d") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("y", 0) == GetDataType::value)); \ - REGISTER_USER_KERNEL("upsample_trilinear_3d_grad") \ - .SetCreateFn>() \ - .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ - && (user_op::HobDataType("dx", 0) == GetDataType::value)); - -REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(float) -REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(double) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include "oneflow/core/framework/framework.h" +#include "oneflow/core/kernel/new_kernel_util.h" +#include "oneflow/core/common/nd_index_offset_helper.h" +#include "oneflow/core/hip/atomic.hip.h" +#include "oneflow/user/kernels/upsample_kernel.h" + +namespace oneflow { + +namespace { + +template +__global__ void UpsampleTrilinear3DForward(const int64_t elem_cnt, const T* in_dptr, + NdIndexOffsetHelper in_helper, + NdIndexOffsetHelper out_helper, + const int64_t in_depth, const int64_t in_height, + const int64_t in_width, const T rdepth, const T rheight, + const T rwidth, const bool align_corners, T* out_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + out_helper.OffsetToNdIndex(index, n, c, d, h, w); + + const T t1r = GetAreaPixel(rdepth, d, align_corners); + const int64_t t1 = t1r; + const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; + const T t1lambda = t1r - t1; + const T t0lambda = static_cast(1.) - t1lambda; + + const T h1r = GetAreaPixel(rheight, h, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const T h1lambda = h1r - h1; + const T h0lambda = static_cast(1.) - h1lambda; + + const T w1r = GetAreaPixel(rwidth, w, align_corners); + const int64_t w1 = w1r; + const int64_t w1p = (w1 < in_width - 1) ? 1 : 0; + const T w1lambda = w1r - w1; + const T w0lambda = static_cast(1.) - w1lambda; + + const T* pos1 = &in_dptr[in_helper.NdIndexToOffset(n, c, t1, h1, w1)]; + + out_dptr[index] = + t0lambda + * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + + h1lambda + * (w0lambda * pos1[h1p * in_width] + w1lambda * pos1[h1p * in_width + w1p])) + + t1lambda + * (h0lambda + * (w0lambda * pos1[t1p * in_height * in_width] + + w1lambda * pos1[t1p * in_height * in_width + w1p]) + + h1lambda + * (w0lambda * pos1[t1p * in_height * in_width + h1p * in_width] + + w1lambda * pos1[t1p * in_height * in_width + h1p * in_width + w1p])); + } +} + +template +__global__ void UpsampleTrilinear3DBackward(const int64_t elem_cnt, const T* dy_dptr, + NdIndexOffsetHelper dy_helper, + NdIndexOffsetHelper dx_helper, + const int64_t in_depth, const int64_t in_height, + const int64_t in_width, const T rdepth, const T rheight, + const T rwidth, const bool align_corners, T* dx_dptr) { + CUDA_1D_KERNEL_LOOP(index, elem_cnt) { + int64_t n, c, d, h, w; + dy_helper.OffsetToNdIndex(index, n, c, d, h, w); + + const T t1r = GetAreaPixel(rdepth, d, align_corners); + const int64_t t1 = t1r; + const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0; + const T t1lambda = t1r - t1; + const T t0lambda = static_cast(1.) - t1lambda; + + const T h1r = GetAreaPixel(rheight, h, align_corners); + const int64_t h1 = h1r; + const int64_t h1p = (h1 < in_height - 1) ? 1 : 0; + const T h1lambda = h1r - h1; + const T h0lambda = static_cast(1.) - h1lambda; + + const T w1r = GetAreaPixel(rwidth, w, align_corners); + const int64_t w1 = w1r; + const int64_t w1p = (w1 < in_width - 1) ? 1 : 0; + const T w1lambda = w1r - w1; + const T w0lambda = static_cast(1.) 
- w1lambda; + + T* pos1 = &dx_dptr[dx_helper.NdIndexToOffset(n, c, t1, h1, w1)]; + const T* pos2 = &dy_dptr[index]; + + cuda::atomic::Add(pos1 + 0, t0lambda * h0lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + w1p, t0lambda * h0lambda * w1lambda * pos2[0]); + cuda::atomic::Add(pos1 + h1p * in_width, t0lambda * h1lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + h1p * in_width + w1p, t0lambda * h1lambda * w1lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width, t1lambda * h0lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width + w1p, + t1lambda * h0lambda * w1lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width, + t1lambda * h1lambda * w0lambda * pos2[0]); + cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width + w1p, + t1lambda * h1lambda * w1lambda * pos2[0]); + } +} + +} // namespace + +template +class UpsampleTrilinear3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleTrilinear3DGPUKernel() = default; + ~UpsampleTrilinear3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void Compute(user_op::KernelComputeContext* ctx) const override { + const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0); + user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0); + const bool align_corners = ctx->Attr("align_corners"); + const int64_t elem_cnt = y_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper in_helper( + x_tensor->shape_view().At(0), x_tensor->shape_view().At(1), x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4)); + NdIndexOffsetHelper out_helper( + y_tensor->shape_view().At(0), y_tensor->shape_view().At(1), y_tensor->shape_view().At(2), + y_tensor->shape_view().At(3), y_tensor->shape_view().At(4)); + + const int64_t in_depth = x_tensor->shape_view().At(2); + const int64_t in_height = x_tensor->shape_view().At(3); + const int64_t in_width = x_tensor->shape_view().At(4); + + const int64_t out_depth = y_tensor->shape_view().At(2); + const int64_t out_height = y_tensor->shape_view().At(3); + const int64_t out_width = y_tensor->shape_view().At(4); + + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + + const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + + RUN_CUDA_KERNEL((UpsampleTrilinear3DForward), ctx->stream(), elem_cnt, elem_cnt, + x_tensor->dptr(), in_helper, out_helper, x_tensor->shape_view().At(2), + x_tensor->shape_view().At(3), x_tensor->shape_view().At(4), scale_depth, + scale_height, scale_width, align_corners, y_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +template +class UpsampleTrilinearGrad3DGPUKernel final : public user_op::OpKernel { + public: + UpsampleTrilinearGrad3DGPUKernel() = default; + ~UpsampleTrilinearGrad3DGPUKernel() = default; + + private: + using user_op::OpKernel::Compute; + void 
Compute(user_op::KernelComputeContext* ctx) const override { + user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0); + + Memset(ctx->stream(), dx_tensor->mut_dptr(), 0, + dx_tensor->shape_view().elem_cnt() * sizeof(T)); + const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0); + const bool align_corners = ctx->Attr("align_corners"); + const int64_t elem_cnt = dy_tensor->shape_view().elem_cnt(); + NdIndexOffsetHelper dy_helper( + dy_tensor->shape_view().At(0), dy_tensor->shape_view().At(1), dy_tensor->shape_view().At(2), + dy_tensor->shape_view().At(3), dy_tensor->shape_view().At(4)); + NdIndexOffsetHelper dx_helper( + dx_tensor->shape_view().At(0), dx_tensor->shape_view().At(1), dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4)); + + const int64_t in_depth = dx_tensor->shape_view().At(2); + const int64_t in_height = dx_tensor->shape_view().At(3); + const int64_t in_width = dx_tensor->shape_view().At(4); + + const int64_t out_depth = dy_tensor->shape_view().At(2); + const int64_t out_height = dy_tensor->shape_view().At(3); + const int64_t out_width = dy_tensor->shape_view().At(4); + + const std::vector output_size = ctx->Attr>("output_size"); + double depth_scale = ctx->Attr("depth_scale"); + double height_scale = ctx->Attr("height_scale"); + double width_scale = ctx->Attr("width_scale"); + if (!output_size.empty()) { + depth_scale = static_cast(out_depth) / static_cast(in_depth); + height_scale = static_cast(out_height) / static_cast(in_height); + width_scale = static_cast(out_width) / static_cast(in_width); + } + + const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale); + const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale); + const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale); + + RUN_CUDA_KERNEL((UpsampleTrilinear3DBackward), ctx->stream(), elem_cnt, elem_cnt, + dy_tensor->dptr(), dy_helper, dx_helper, dx_tensor->shape_view().At(2), + dx_tensor->shape_view().At(3), dx_tensor->shape_view().At(4), scale_depth, + scale_height, scale_width, align_corners, dx_tensor->mut_dptr()); + } + bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; } +}; + +#define REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(dtype) \ + REGISTER_USER_KERNEL("upsample_trilinear_3d") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("y", 0) == GetDataType::value)); \ + REGISTER_USER_KERNEL("upsample_trilinear_3d_grad") \ + .SetCreateFn>() \ + .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \ + && (user_op::HobDataType("dx", 0) == GetDataType::value)); + +REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(float) +REGISTER_UPSAMPTRILINEAR3D_CUDA_KERNEL(double) + } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/variance_kernel_util.hip.cpp b/oneflow/user/kernels/variance_kernel_util.hip.cpp index 44eb164..47245f3 100644 --- a/oneflow/user/kernels/variance_kernel_util.hip.cpp +++ b/oneflow/user/kernels/variance_kernel_util.hip.cpp @@ -1,192 +1,192 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -#include "hip/hip_runtime.h" -#include -#include "oneflow/user/kernels/variance_kernel_util.h" -#include "oneflow/core/hip/layer_norm.hip.h" - -namespace oneflow { -namespace user_op { - -namespace { -template -__inline__ __device__ T Nan(); - -template<> -__inline__ __device__ float Nan() { - return __int_as_float(0x7fffffffU); -} - -template<> -__inline__ __device__ double Nan() { - return __longlong_as_double(0xfff8000000000000ULL); -} -} // namespace - -template -__global__ void ComputeVarUsingWelfordWrapper(const T* in_ptr, T* out_ptr, const VarParam var_param, - bool is_nan) { - if (is_nan) { - CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { out_ptr[i] = Nan(); } - } else { - CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { - const size_t input_offset = LinearIndex2Offset( - i, var_param.dim_size_in_caxis, var_param.stride_in_caxis, var_param.caxis_size); - ComputeVarUsingWelford(&in_ptr[input_offset], &out_ptr[i], var_param); - } - } -} - -namespace { -template -inline __device__ void WelfordReduce(const T* in_ptr, T* mean, T* m2, T* count, - const size_t total_elem_cnt, const size_t start, - const size_t step) { - T old_mean = 0.0; - for (size_t i = start; i < total_elem_cnt; i += step) { - ++(*count); - old_mean = *mean; - *mean += (in_ptr[i] - *mean) / *count; - *m2 += (in_ptr[i] - *mean) * (in_ptr[i] - old_mean); - } -} - -template -inline __device__ void WelfordCombine(const T* b_mean, const T* b_m2, const T* b_count, T* mean, - T* m2, T* count, const size_t total_elem_cnt, - const size_t start, const size_t step) { - for (size_t i = start; i < total_elem_cnt; i += step) { - cuda::layer_norm::WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count); - } -} -__device__ int32_t done_block_count = 0; -} // namespace - -template -__global__ void ComputeVarScalarOut(const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, - const VarParam var_param) { - if (var_param.elem_cnt == 1 && var_param.unbiased == true) { - if (blockIdx.x == 0 && threadIdx.x == 0) { *out_ptr = Nan(); } - return; - } - const size_t elems_per_block = var_param.elem_cnt / gridDim.x; - const size_t elems_per_thread = elems_per_block / blockDim.x; - // tail element number in block - size_t tail_elems = elems_per_block % blockDim.x; - - T thread_mean = 0.0; - T thread_m2 = 0.0; - T thread_count = 0.0; - // every thread deal it's elems - if (elems_per_thread > 0) { - const size_t block_offset = blockIdx.x * elems_per_block; - WelfordReduce(&in_ptr[block_offset], &thread_mean, &thread_m2, &thread_count, - elems_per_block - tail_elems, threadIdx.x, blockDim.x); - } - // thread 0 of last block handles tail element between blocks - if (blockIdx.x == gridDim.x - 1 && threadIdx.x == 0) { - tail_elems += var_param.elem_cnt % gridDim.x; - } - // thread 0 deal tail elems - if (tail_elems != 0 && threadIdx.x == 0) { - const size_t tail_offset = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread; - WelfordReduce(&in_ptr[tail_offset], &thread_mean, &thread_m2, &thread_count, tail_elems, - /*tail start=*/0, /*step=*/1); - } - - T block_mean = 0; - T block_m2 = 0; - T block_count = 0; - 
cuda::layer_norm::WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &block_mean, - &block_m2, &block_count); - - if (gridDim.x == 1) { - if (threadIdx.x == 0) { - *out_ptr = - cuda::layer_norm::Div(block_m2, (var_param.unbiased ? block_count - 1 : block_count)); - } - return; - } - - T* tmp_mean_ptr = tmp_buffer_ptr; - T* tmp_m2_ptr = &tmp_mean_ptr[gridDim.x]; - T* tmp_count_ptr = &tmp_m2_ptr[gridDim.x]; - if (threadIdx.x == 0) { - tmp_mean_ptr[blockIdx.x] = block_mean; - tmp_m2_ptr[blockIdx.x] = block_m2; - tmp_count_ptr[blockIdx.x] = block_count; - } - __shared__ bool is_last_block; - if (threadIdx.x == 0) { is_last_block = atomicAdd(&done_block_count, 1) == gridDim.x - 1; } - __syncthreads(); - if (is_last_block) { - T last_block_thread_mean = 0; - T last_block_thread_m2 = 0; - T last_block_thread_count = 0; - const size_t welforddatas_per_thread = gridDim.x / blockDim.x; - const size_t tail_welforddatas = gridDim.x % blockDim.x; - - if (welforddatas_per_thread > 0) { - WelfordCombine(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr, &last_block_thread_mean, - &last_block_thread_m2, &last_block_thread_count, gridDim.x - tail_welforddatas, - threadIdx.x, blockDim.x); - } - // thread 0 deal tail welford data - if (tail_welforddatas != 0 && threadIdx.x == 0) { - const size_t last_block_tail_offset = blockDim.x * welforddatas_per_thread; - WelfordCombine(&tmp_mean_ptr[last_block_tail_offset], &tmp_m2_ptr[last_block_tail_offset], - &tmp_count_ptr[last_block_tail_offset], &last_block_thread_mean, - &last_block_thread_m2, &last_block_thread_count, tail_welforddatas, - /*tail start=*/0, /*step=*/1); - } - T final_mean = 0; - T final_m2 = 0; - T final_count = 0; - cuda::layer_norm::WelfordBlockAllReduce(last_block_thread_mean, last_block_thread_m2, - last_block_thread_count, &final_mean, &final_m2, - &final_count); - if (threadIdx.x == 0) { - *out_ptr = - cuda::layer_norm::Div(final_m2, (var_param.unbiased ? final_count - 1 : final_count)); - done_block_count = 0; - } - } -} - -template -struct VarFunctor final { - void operator()(ep::Stream* stream, const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, - const VarParam var_param) { - int grid_dim = 0; - int block_dim = 0; - SetGridDimAndBlockDim(var_param.elem_cnt, &grid_dim, &block_dim); - if (var_param.parallel_num == 1) { - ComputeVarScalarOut - <<As()->cuda_stream()>>>( - in_ptr, out_ptr, tmp_buffer_ptr, var_param); - } else { - // if var_param.parallel_num is 0, do nothing, return 0-size tensor - if (var_param.parallel_num == 0) { return; } - RUN_CUDA_KERNEL(ComputeVarUsingWelfordWrapper, stream, var_param.parallel_num, in_ptr, - out_ptr, var_param, IsNanOut(var_param)); - } - } -}; - -template struct VarFunctor; -template struct VarFunctor; -} // namespace user_op +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "hip/hip_runtime.h" +#include +#include "oneflow/user/kernels/variance_kernel_util.h" +#include "oneflow/core/hip/layer_norm.hip.h" + +namespace oneflow { +namespace user_op { + +namespace { +template +__inline__ __device__ T Nan(); + +template<> +__inline__ __device__ float Nan() { + return __int_as_float(0x7fffffffU); +} + +template<> +__inline__ __device__ double Nan() { + return __longlong_as_double(0xfff8000000000000ULL); +} +} // namespace + +template +__global__ void ComputeVarUsingWelfordWrapper(const T* in_ptr, T* out_ptr, const VarParam var_param, + bool is_nan) { + if (is_nan) { + CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { out_ptr[i] = Nan(); } + } else { + CUDA_1D_KERNEL_LOOP(i, var_param.parallel_num) { + const size_t input_offset = LinearIndex2Offset( + i, var_param.dim_size_in_caxis, var_param.stride_in_caxis, var_param.caxis_size); + ComputeVarUsingWelford(&in_ptr[input_offset], &out_ptr[i], var_param); + } + } +} + +namespace { +template +inline __device__ void WelfordReduce(const T* in_ptr, T* mean, T* m2, T* count, + const size_t total_elem_cnt, const size_t start, + const size_t step) { + T old_mean = 0.0; + for (size_t i = start; i < total_elem_cnt; i += step) { + ++(*count); + old_mean = *mean; + *mean += (in_ptr[i] - *mean) / *count; + *m2 += (in_ptr[i] - *mean) * (in_ptr[i] - old_mean); + } +} + +template +inline __device__ void WelfordCombine(const T* b_mean, const T* b_m2, const T* b_count, T* mean, + T* m2, T* count, const size_t total_elem_cnt, + const size_t start, const size_t step) { + for (size_t i = start; i < total_elem_cnt; i += step) { + cuda::layer_norm::WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count); + } +} +__device__ int32_t done_block_count = 0; +} // namespace + +template +__global__ void ComputeVarScalarOut(const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, + const VarParam var_param) { + if (var_param.elem_cnt == 1 && var_param.unbiased == true) { + if (blockIdx.x == 0 && threadIdx.x == 0) { *out_ptr = Nan(); } + return; + } + const size_t elems_per_block = var_param.elem_cnt / gridDim.x; + const size_t elems_per_thread = elems_per_block / blockDim.x; + // tail element number in block + size_t tail_elems = elems_per_block % blockDim.x; + + T thread_mean = 0.0; + T thread_m2 = 0.0; + T thread_count = 0.0; + // every thread deal it's elems + if (elems_per_thread > 0) { + const size_t block_offset = blockIdx.x * elems_per_block; + WelfordReduce(&in_ptr[block_offset], &thread_mean, &thread_m2, &thread_count, + elems_per_block - tail_elems, threadIdx.x, blockDim.x); + } + // thread 0 of last block handles tail element between blocks + if (blockIdx.x == gridDim.x - 1 && threadIdx.x == 0) { + tail_elems += var_param.elem_cnt % gridDim.x; + } + // thread 0 deal tail elems + if (tail_elems != 0 && threadIdx.x == 0) { + const size_t tail_offset = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread; + WelfordReduce(&in_ptr[tail_offset], &thread_mean, &thread_m2, &thread_count, tail_elems, + /*tail start=*/0, /*step=*/1); + } + + T block_mean = 0; + T block_m2 = 0; + T block_count = 0; + cuda::layer_norm::WelfordBlockAllReduce(thread_mean, thread_m2, thread_count, &block_mean, + &block_m2, &block_count); + + if (gridDim.x == 1) { + if (threadIdx.x == 0) { + *out_ptr = + cuda::layer_norm::Div(block_m2, (var_param.unbiased ? 
block_count - 1 : block_count)); + } + return; + } + + T* tmp_mean_ptr = tmp_buffer_ptr; + T* tmp_m2_ptr = &tmp_mean_ptr[gridDim.x]; + T* tmp_count_ptr = &tmp_m2_ptr[gridDim.x]; + if (threadIdx.x == 0) { + tmp_mean_ptr[blockIdx.x] = block_mean; + tmp_m2_ptr[blockIdx.x] = block_m2; + tmp_count_ptr[blockIdx.x] = block_count; + } + __shared__ bool is_last_block; + if (threadIdx.x == 0) { is_last_block = atomicAdd(&done_block_count, 1) == gridDim.x - 1; } + __syncthreads(); + if (is_last_block) { + T last_block_thread_mean = 0; + T last_block_thread_m2 = 0; + T last_block_thread_count = 0; + const size_t welforddatas_per_thread = gridDim.x / blockDim.x; + const size_t tail_welforddatas = gridDim.x % blockDim.x; + + if (welforddatas_per_thread > 0) { + WelfordCombine(tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr, &last_block_thread_mean, + &last_block_thread_m2, &last_block_thread_count, gridDim.x - tail_welforddatas, + threadIdx.x, blockDim.x); + } + // thread 0 deal tail welford data + if (tail_welforddatas != 0 && threadIdx.x == 0) { + const size_t last_block_tail_offset = blockDim.x * welforddatas_per_thread; + WelfordCombine(&tmp_mean_ptr[last_block_tail_offset], &tmp_m2_ptr[last_block_tail_offset], + &tmp_count_ptr[last_block_tail_offset], &last_block_thread_mean, + &last_block_thread_m2, &last_block_thread_count, tail_welforddatas, + /*tail start=*/0, /*step=*/1); + } + T final_mean = 0; + T final_m2 = 0; + T final_count = 0; + cuda::layer_norm::WelfordBlockAllReduce(last_block_thread_mean, last_block_thread_m2, + last_block_thread_count, &final_mean, &final_m2, + &final_count); + if (threadIdx.x == 0) { + *out_ptr = + cuda::layer_norm::Div(final_m2, (var_param.unbiased ? final_count - 1 : final_count)); + done_block_count = 0; + } + } +} + +template +struct VarFunctor final { + void operator()(ep::Stream* stream, const T* in_ptr, T* out_ptr, T* tmp_buffer_ptr, + const VarParam var_param) { + int grid_dim = 0; + int block_dim = 0; + SetGridDimAndBlockDim(var_param.elem_cnt, &grid_dim, &block_dim); + if (var_param.parallel_num == 1) { + ComputeVarScalarOut + <<As()->cuda_stream()>>>( + in_ptr, out_ptr, tmp_buffer_ptr, var_param); + } else { + // if var_param.parallel_num is 0, do nothing, return 0-size tensor + if (var_param.parallel_num == 0) { return; } + RUN_CUDA_KERNEL(ComputeVarUsingWelfordWrapper, stream, var_param.parallel_num, in_ptr, + out_ptr, var_param, IsNanOut(var_param)); + } + } +}; + +template struct VarFunctor; +template struct VarFunctor; +} // namespace user_op } // namespace oneflow \ No newline at end of file diff --git a/oneflow/user/kernels/where_kernel_util.hip.cpp b/oneflow/user/kernels/where_kernel_util.hip.cpp index 276b1c1..b3a619a 100644 --- a/oneflow/user/kernels/where_kernel_util.hip.cpp +++ b/oneflow/user/kernels/where_kernel_util.hip.cpp @@ -1,90 +1,90 @@ -/* -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -#include "oneflow/user/kernels/where_kernel_util.h" -#include "oneflow/core/hip/elementwise.hip.h" -#include "oneflow/core/ep/rocm/cuda_stream.h" - -namespace oneflow { - -namespace { - -template -struct WhereFunctor { - OF_DEVICE_FUNC T operator()(CondT cond, T lhs, T rhs) const { - return static_cast(cond) ? lhs : rhs; - } -}; - -template -struct WhereScalarXFunctor { - OF_DEVICE_FUNC explicit WhereScalarXFunctor(T scalar) : x_scalar(scalar) {} - OF_DEVICE_FUNC T operator()(CondT cond, T rhs) const { - return static_cast(cond) ? x_scalar : rhs; - } - const T x_scalar; -}; - -template -struct WhereScalarYFunctor { - OF_DEVICE_FUNC explicit WhereScalarYFunctor(T scalar) : y_scalar(scalar) {} - OF_DEVICE_FUNC T operator()(CondT cond, T lhs) const { - return static_cast(cond) ? lhs : y_scalar; - } - const T y_scalar; -}; - -template -struct WhereScalarXYFunctor { - OF_DEVICE_FUNC explicit WhereScalarXYFunctor(T x_scalar, T y_scalar) - : x_scalar(x_scalar), y_scalar(y_scalar) {} - OF_DEVICE_FUNC T operator()(CondT cond) const { - return static_cast(cond) ? x_scalar : y_scalar; - } - const T x_scalar; - const T y_scalar; -}; - -} // namespace - -template -struct WhereKernelUtil { - static void Where(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, const T* lhs, - const T* rhs, T* out) { - cuda::elementwise::Ternary(WhereFunctor(), elem_cnt, out, cond, lhs, rhs, - stream->As()->cuda_stream()); - } - static void WhereXScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, - const T x_scalar, const T* rhs, T* out) { - cuda::elementwise::Binary(WhereScalarXFunctor(x_scalar), elem_cnt, out, cond, rhs, - stream->As()->cuda_stream()); - } - static void WhereYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, - const T* lhs, const T y_scalar, T* out) { - cuda::elementwise::Binary(WhereScalarYFunctor(y_scalar), elem_cnt, out, cond, lhs, - stream->As()->cuda_stream()); - } - static void WhereXYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, - const T x_scalar, const T y_scalar, T* out) { - cuda::elementwise::Unary(WhereScalarXYFunctor(x_scalar, y_scalar), elem_cnt, out, - cond, stream->As()->cuda_stream()); - } -}; - -OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_WHERE_FUNCTOR, (DeviceType::kCUDA), - ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, - INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ) - +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "oneflow/user/kernels/where_kernel_util.h" +#include "oneflow/core/hip/elementwise.hip.h" +#include "oneflow/core/ep/rocm/cuda_stream.h" + +namespace oneflow { + +namespace { + +template +struct WhereFunctor { + OF_DEVICE_FUNC T operator()(CondT cond, T lhs, T rhs) const { + return static_cast(cond) ? lhs : rhs; + } +}; + +template +struct WhereScalarXFunctor { + OF_DEVICE_FUNC explicit WhereScalarXFunctor(T scalar) : x_scalar(scalar) {} + OF_DEVICE_FUNC T operator()(CondT cond, T rhs) const { + return static_cast(cond) ? 
x_scalar : rhs; + } + const T x_scalar; +}; + +template +struct WhereScalarYFunctor { + OF_DEVICE_FUNC explicit WhereScalarYFunctor(T scalar) : y_scalar(scalar) {} + OF_DEVICE_FUNC T operator()(CondT cond, T lhs) const { + return static_cast(cond) ? lhs : y_scalar; + } + const T y_scalar; +}; + +template +struct WhereScalarXYFunctor { + OF_DEVICE_FUNC explicit WhereScalarXYFunctor(T x_scalar, T y_scalar) + : x_scalar(x_scalar), y_scalar(y_scalar) {} + OF_DEVICE_FUNC T operator()(CondT cond) const { + return static_cast(cond) ? x_scalar : y_scalar; + } + const T x_scalar; + const T y_scalar; +}; + +} // namespace + +template +struct WhereKernelUtil { + static void Where(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, const T* lhs, + const T* rhs, T* out) { + cuda::elementwise::Ternary(WhereFunctor(), elem_cnt, out, cond, lhs, rhs, + stream->As()->cuda_stream()); + } + static void WhereXScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, + const T x_scalar, const T* rhs, T* out) { + cuda::elementwise::Binary(WhereScalarXFunctor(x_scalar), elem_cnt, out, cond, rhs, + stream->As()->cuda_stream()); + } + static void WhereYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, + const T* lhs, const T y_scalar, T* out) { + cuda::elementwise::Binary(WhereScalarYFunctor(y_scalar), elem_cnt, out, cond, lhs, + stream->As()->cuda_stream()); + } + static void WhereXYScalar(ep::Stream* stream, const int64_t elem_cnt, const CondT* cond, + const T x_scalar, const T y_scalar, T* out) { + cuda::elementwise::Unary(WhereScalarXYFunctor(x_scalar, y_scalar), elem_cnt, out, + cond, stream->As()->cuda_stream()); + } +}; + +OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_WHERE_FUNCTOR, (DeviceType::kCUDA), + ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, + INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ) + } // namespace oneflow \ No newline at end of file diff --git a/python/oneflow/test/modules/fused_dot_feature_interaction.py b/python/oneflow/test/modules/fused_dot_feature_interaction.py index 6712f04..7b86daa 100644 --- a/python/oneflow/test/modules/fused_dot_feature_interaction.py +++ b/python/oneflow/test/modules/fused_dot_feature_interaction.py @@ -1,43 +1,43 @@ -import numpy as np -import oneflow as flow - -def fused_dot_feature_interaction(x, - y, - self_interaction=False, - output_padding=0, - output_concat=None, - dtype=flow.float32 - ): - # (bs, es) = x.shape - (bs, dims, es) = y.shape - - if self_interaction: - offset = 1 - else: - offset = 0 - li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)]) - lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)]) - T = flow.cat( - [ - flow.reshape(x, (bs, 1, es)), - y, - ], - dim=1, - ) - Z = flow.matmul(T, T, transpose_b=True) - # gather_nd not support half, so cast to float32 - Z = flow.cast(Z, flow.float32) - Zflat = Z[:, li, lj] - Zflat = flow.cast(Zflat, dtype) - if output_concat is not None: - R = flow.cat([output_concat, Zflat], dim=1) - else: - R = Zflat - if output_padding != 0: - padding_tensor = flow.tensor( - np.zeros((bs, output_padding)).astype(np.float32), - device="cuda", - requires_grad=False, - ) - R = flow.cat([R, padding_tensor], dim=1) - return R +import numpy as np +import oneflow as flow + +def fused_dot_feature_interaction(x, + y, + self_interaction=False, + output_padding=0, + output_concat=None, + dtype=flow.float32 + ): + # (bs, es) = x.shape + (bs, dims, es) = y.shape + + if self_interaction: + offset = 1 + else: + offset = 0 + li = 
flow.tensor([i for i in range(dims + 1) for j in range(i + offset)]) + lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)]) + T = flow.cat( + [ + flow.reshape(x, (bs, 1, es)), + y, + ], + dim=1, + ) + Z = flow.matmul(T, T, transpose_b=True) + # gather_nd not support half, so cast to float32 + Z = flow.cast(Z, flow.float32) + Zflat = Z[:, li, lj] + Zflat = flow.cast(Zflat, dtype) + if output_concat is not None: + R = flow.cat([output_concat, Zflat], dim=1) + else: + R = Zflat + if output_padding != 0: + padding_tensor = flow.tensor( + np.zeros((bs, output_padding)).astype(np.float32), + device="cuda", + requires_grad=False, + ) + R = flow.cat([R, padding_tensor], dim=1) + return R diff --git a/python/oneflow/test/modules/test_conv.py b/python/oneflow/test/modules/test_conv.py index 001d35e..201f9b7 100644 --- a/python/oneflow/test/modules/test_conv.py +++ b/python/oneflow/test/modules/test_conv.py @@ -1,346 +1,346 @@ -import unittest -from collections import OrderedDict - -import numpy as np -from oneflow.test_utils.test_util import GenArgList -from oneflow.test_utils.automated_test_util import * - -import oneflow as flow -import oneflow.nn as nn -import oneflow.unittest - -np_arr = np.array([[[1.28795946, -0.2921792, 0.20338029, 0.78604293, -1.89607573]]]) -input = flow.tensor( - np_arr, dtype=flow.float32, device=flow.device("cuda"), requires_grad=True -) -weight = np.array( - [ - [[0.10197904, 0.3372305, -0.25743008]], - [[0.27720425, -0.52435774, -0.38381988]], - [[0.56016803, -0.10063095, -0.10760903]], - ] -) -m = nn.Conv1d(1, 3, 3, stride=1, bias=False) -m.weight = flow.nn.Parameter(flow.Tensor(weight)) -m = m.to("cuda") -output = m(input) -np_out = np.array( - [ - [ - [-0.01954307, -0.16356121, 0.77392507], - [0.43217283, -0.48933625, 0.37196174], - [0.72899038, -0.2687211, 0.23886177], - ] - ] -) -if np.allclose(output.numpy(), np_out, 1e-06, 1e-06): - print("conv1d Passed") -output = output.sum() -output.backward() -np_grad = np.array( - [[[0.93935132, 0.65159315, -0.09726584, -1.03661716, -0.74885899]]] -) -if np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06): - print("conv1d_back Passed") - - - -test_conv2d_weight = np.array( - [ - [ - [ - [0.8586049675941467, -0.2279418259859085, 0.2013147622346878], - [0.35005471110343933, 0.5360521078109741, 1.5194443464279175], - [1.9040879011154175, -1.5734431743621826, -0.14007866382598877], - ] - ], - [ - [ - [0.29670074582099915, 1.3111951351165771, 0.5035904049873352], - [-1.1894450187683105, -0.5502137541770935, -1.591875672340393], - [-1.1081947088241577, 0.07872020453214645, -0.9185634255409241], - ] - ], - [ - [ - [-0.7457143664360046, -1.2080862522125244, 1.8140212297439575], - [-1.5227429866790771, -2.515244960784912, -1.3549325466156006], - [-0.9574840068817139, -0.7248556613922119, 1.1119636297225952], - ] - ], - ] -) -test_conv2d_data = np.array( - [ - [ - [ - [ - 1.1630785465240479, - 0.4838046133518219, - 0.299563467502594, - 0.15302546322345734, - -1.168814778327942, - ], - [ - 1.5580710172653198, - -0.5459445714950562, - -2.3556296825408936, - 0.5414402484893799, - 2.678506374359131, - ], - [ - 1.2546343803405762, - -0.5487740635871887, - -0.6810643672943115, - -0.13531559705734253, - 0.37723132967948914, - ], - [ - 0.41016456484794617, - 0.5712682008743286, - -2.757962703704834, - 1.0762799978256226, - -0.6141325235366821, - ], - [ - 1.830764889717102, - -1.1468064785003662, - 0.053837940096855164, - -2.5074806213378906, - -0.5916498899459839, - ], - ] - ] - ] -) -test_conv2d_data_grad = np.array( 
- [ - [ - [ - [ - 0.4095913469791412, - 0.2847584038972855, - 2.803684800863266, - 2.3940934538841248, - 2.5189263969659805, - ], - [ - -1.9525419473648071, - -4.606781497597694, - -3.51521897315979, - -1.562677025794983, - 1.0915625244379044, - ], - [ - -2.1141327619552612, - -6.987950943410397, - -5.84306687861681, - -3.7289341166615486, - 1.1448840647935867, - ], - [ - -2.5237241089344025, - -7.272709347307682, - -8.646751679480076, - -6.123027570545673, - -1.3740423321723938, - ], - [ - -0.1615908145904541, - -2.381169445812702, - -2.32784790545702, - -2.1662570908665657, - 0.0533215403556824, - ], - ] - ] - ] -) -test_conv2d_weight_grad = np.array( - [ - [ - [ - [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], - [-3.095237225294113, -4.835702538490295, -1.8706469237804413], - [-1.0139376372098923, -6.076017692685127, -5.780256435275078], - ] - ], - [ - [ - [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], - [-3.095237225294113, -4.835702538490295, -1.8706469237804413], - [-1.0139376372098923, -6.076017692685127, -5.780256435275078], - ] - ], - [ - [ - [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], - [-3.095237225294113, -4.835702538490295, -1.8706469237804413], - [-1.0139376372098923, -6.076017692685127, -5.780256435275078], - ] - ], - ] -) -test_conv2d_output = np.array( - [ - [ - [ - [0.9699610471725464, -0.20758534967899323, 2.3857712745666504], - [0.3666309118270874, 4.690882682800293, -8.203354835510254], - [2.6072847843170166, -1.9033538103103638, 2.331153154373169], - ], - [ - [2.519343852996826, 2.3757898807525635, -1.6613528728485107], - [0.5777544379234314, -3.5739502906799316, 5.349126815795898], - [0.729295015335083, 1.5791023969650269, 3.7627718448638916], - ], - [ - [-0.27685487270355225, 6.446267127990723, -2.762883424758911], - [-8.25644588470459, 9.616064071655273, 8.005367279052734], - [-0.6944921016693115, 3.866114854812622, 4.788446426391602], - ], - ] - ] -) -test_conv2d_with_bias_weight = np.array( - [ - [ - [ - [1.8271433115005493, -1.0446699857711792, 1.0062190294265747], - [0.5174201130867004, -0.806931734085083, 1.3769007921218872], - [0.205885112285614, 0.9943519234657288, -0.23580588400363922], - ] - ], - [ - [ - [0.29881811141967773, -1.9982075691223145, 0.3511354625225067], - [-0.7644741535186768, 1.2594351768493652, -0.9629734754562378], - [0.5080506205558777, 0.7561734318733215, 1.6839302778244019], - ] - ], - [ - [ - [1.2573646306991577, 0.13123232126235962, 1.6403018236160278], - [-1.2138012647628784, 2.399970531463623, -0.38509097695350647], - [-0.9878040552139282, 0.9585888385772705, -1.4976465702056885], - ] - ], - ] -) -test_conv2d_with_bias_bias = np.array( - [0.6605162620544434, -0.18903568387031555, -0.27302607893943787] -) -test_conv2d_with_bias_data = np.array( - [ - [ - [ - [ - -0.47827261686325073, - -1.1739492416381836, - -0.7921845316886902, - 0.9321041703224182, - -3.1557741165161133, - ], - [ - 2.1935296058654785, - -0.5385921001434326, - -0.8611332774162292, - -1.881519079208374, - -0.7205708026885986, - ], - [ - -0.35601571202278137, - -0.15963983535766602, - 1.797447681427002, - 0.19594945013523102, - -1.7376397848129272, - ], - [ - 0.047347065061330795, - 0.14580930769443512, - 0.32604914903640747, - 0.4578782916069031, - -0.8942581415176392, - ], - [ - 0.49383941292762756, - -0.9043426513671875, - -1.2140793800354004, - 2.1564064025878906, - 1.0938222408294678, - ], - ] - ] - ] -) -test_conv2d_with_bias_output = np.array( - [ - [ - [ - [-0.05607491731643677, -0.185230553150177, 
-3.8808679580688477], - [6.861937046051025, -2.3341472148895264, -0.5597308874130249], - [1.8299254179000854, -2.770848274230957, 2.1958212852478027], - ], - [ - [2.9348952770233154, 4.117504119873047, -6.278541088104248], - [0.2638452351093292, 3.998856782913208, 2.612290620803833], - [-1.9891828298568726, -1.6476304531097412, 3.39066219329834], - ], - [ - [-8.44466781616211, 0.5747121572494507, -8.501373291015625], - [-0.036642804741859436, -0.23458999395370483, -2.370849370956421], - [2.8372013568878174, -2.987276077270508, 1.8382092714309692], - ], - ] - ] -) - -to_device = flow.device("cuda") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) -x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) -conv.bias = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) -conv.to(to_device) -of_out = conv(x) -if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): - print("conv2d_bias Passed") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) -x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) -conv.to(to_device) -of_out = conv(x) -of_out.sum().backward() -if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_data_grad Passed") - -if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_weight_grad Passed") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) -x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) -conv.bias = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) -conv.to(to_device) -of_out = conv(x) -if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): - print("conv2d_bias Passed") - -conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) -x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) -conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) -conv.to(to_device) -of_out = conv(x) -of_out.sum().backward() -if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_data_grad Passed") - -if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): - print("con2d_back_weight_grad Passed") - - - - - - +import unittest +from collections import OrderedDict + +import numpy as np +from oneflow.test_utils.test_util import GenArgList +from oneflow.test_utils.automated_test_util import * + +import oneflow as flow +import oneflow.nn as nn +import oneflow.unittest + +np_arr = np.array([[[1.28795946, -0.2921792, 0.20338029, 0.78604293, -1.89607573]]]) +input = flow.tensor( + np_arr, dtype=flow.float32, device=flow.device("cuda"), requires_grad=True +) +weight = np.array( + [ + [[0.10197904, 0.3372305, -0.25743008]], + [[0.27720425, -0.52435774, -0.38381988]], + [[0.56016803, -0.10063095, -0.10760903]], + ] +) +m = nn.Conv1d(1, 3, 3, stride=1, bias=False) +m.weight = flow.nn.Parameter(flow.Tensor(weight)) +m = m.to("cuda") +output = m(input) +np_out = np.array( + [ + [ + [-0.01954307, -0.16356121, 0.77392507], + [0.43217283, -0.48933625, 0.37196174], + [0.72899038, -0.2687211, 0.23886177], + ] + ] +) 
+if np.allclose(output.numpy(), np_out, 1e-06, 1e-06): + print("conv1d Passed") +output = output.sum() +output.backward() +np_grad = np.array( + [[[0.93935132, 0.65159315, -0.09726584, -1.03661716, -0.74885899]]] +) +if np.allclose(input.grad.numpy(), np_grad, 1e-06, 1e-06): + print("conv1d_back Passed") + + + +test_conv2d_weight = np.array( + [ + [ + [ + [0.8586049675941467, -0.2279418259859085, 0.2013147622346878], + [0.35005471110343933, 0.5360521078109741, 1.5194443464279175], + [1.9040879011154175, -1.5734431743621826, -0.14007866382598877], + ] + ], + [ + [ + [0.29670074582099915, 1.3111951351165771, 0.5035904049873352], + [-1.1894450187683105, -0.5502137541770935, -1.591875672340393], + [-1.1081947088241577, 0.07872020453214645, -0.9185634255409241], + ] + ], + [ + [ + [-0.7457143664360046, -1.2080862522125244, 1.8140212297439575], + [-1.5227429866790771, -2.515244960784912, -1.3549325466156006], + [-0.9574840068817139, -0.7248556613922119, 1.1119636297225952], + ] + ], + ] +) +test_conv2d_data = np.array( + [ + [ + [ + [ + 1.1630785465240479, + 0.4838046133518219, + 0.299563467502594, + 0.15302546322345734, + -1.168814778327942, + ], + [ + 1.5580710172653198, + -0.5459445714950562, + -2.3556296825408936, + 0.5414402484893799, + 2.678506374359131, + ], + [ + 1.2546343803405762, + -0.5487740635871887, + -0.6810643672943115, + -0.13531559705734253, + 0.37723132967948914, + ], + [ + 0.41016456484794617, + 0.5712682008743286, + -2.757962703704834, + 1.0762799978256226, + -0.6141325235366821, + ], + [ + 1.830764889717102, + -1.1468064785003662, + 0.053837940096855164, + -2.5074806213378906, + -0.5916498899459839, + ], + ] + ] + ] +) +test_conv2d_data_grad = np.array( + [ + [ + [ + [ + 0.4095913469791412, + 0.2847584038972855, + 2.803684800863266, + 2.3940934538841248, + 2.5189263969659805, + ], + [ + -1.9525419473648071, + -4.606781497597694, + -3.51521897315979, + -1.562677025794983, + 1.0915625244379044, + ], + [ + -2.1141327619552612, + -6.987950943410397, + -5.84306687861681, + -3.7289341166615486, + 1.1448840647935867, + ], + [ + -2.5237241089344025, + -7.272709347307682, + -8.646751679480076, + -6.123027570545673, + -1.3740423321723938, + ], + [ + -0.1615908145904541, + -2.381169445812702, + -2.32784790545702, + -2.1662570908665657, + 0.0533215403556824, + ], + ] + ] + ] +) +test_conv2d_weight_grad = np.array( + [ + [ + [ + [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], + [-3.095237225294113, -4.835702538490295, -1.8706469237804413], + [-1.0139376372098923, -6.076017692685127, -5.780256435275078], + ] + ], + [ + [ + [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], + [-3.095237225294113, -4.835702538490295, -1.8706469237804413], + [-1.0139376372098923, -6.076017692685127, -5.780256435275078], + ] + ], + [ + [ + [0.6277393400669098, -2.7888944894075394, -0.2910575419664383], + [-3.095237225294113, -4.835702538490295, -1.8706469237804413], + [-1.0139376372098923, -6.076017692685127, -5.780256435275078], + ] + ], + ] +) +test_conv2d_output = np.array( + [ + [ + [ + [0.9699610471725464, -0.20758534967899323, 2.3857712745666504], + [0.3666309118270874, 4.690882682800293, -8.203354835510254], + [2.6072847843170166, -1.9033538103103638, 2.331153154373169], + ], + [ + [2.519343852996826, 2.3757898807525635, -1.6613528728485107], + [0.5777544379234314, -3.5739502906799316, 5.349126815795898], + [0.729295015335083, 1.5791023969650269, 3.7627718448638916], + ], + [ + [-0.27685487270355225, 6.446267127990723, -2.762883424758911], + [-8.25644588470459, 
9.616064071655273, 8.005367279052734], + [-0.6944921016693115, 3.866114854812622, 4.788446426391602], + ], + ] + ] +) +test_conv2d_with_bias_weight = np.array( + [ + [ + [ + [1.8271433115005493, -1.0446699857711792, 1.0062190294265747], + [0.5174201130867004, -0.806931734085083, 1.3769007921218872], + [0.205885112285614, 0.9943519234657288, -0.23580588400363922], + ] + ], + [ + [ + [0.29881811141967773, -1.9982075691223145, 0.3511354625225067], + [-0.7644741535186768, 1.2594351768493652, -0.9629734754562378], + [0.5080506205558777, 0.7561734318733215, 1.6839302778244019], + ] + ], + [ + [ + [1.2573646306991577, 0.13123232126235962, 1.6403018236160278], + [-1.2138012647628784, 2.399970531463623, -0.38509097695350647], + [-0.9878040552139282, 0.9585888385772705, -1.4976465702056885], + ] + ], + ] +) +test_conv2d_with_bias_bias = np.array( + [0.6605162620544434, -0.18903568387031555, -0.27302607893943787] +) +test_conv2d_with_bias_data = np.array( + [ + [ + [ + [ + -0.47827261686325073, + -1.1739492416381836, + -0.7921845316886902, + 0.9321041703224182, + -3.1557741165161133, + ], + [ + 2.1935296058654785, + -0.5385921001434326, + -0.8611332774162292, + -1.881519079208374, + -0.7205708026885986, + ], + [ + -0.35601571202278137, + -0.15963983535766602, + 1.797447681427002, + 0.19594945013523102, + -1.7376397848129272, + ], + [ + 0.047347065061330795, + 0.14580930769443512, + 0.32604914903640747, + 0.4578782916069031, + -0.8942581415176392, + ], + [ + 0.49383941292762756, + -0.9043426513671875, + -1.2140793800354004, + 2.1564064025878906, + 1.0938222408294678, + ], + ] + ] + ] +) +test_conv2d_with_bias_output = np.array( + [ + [ + [ + [-0.05607491731643677, -0.185230553150177, -3.8808679580688477], + [6.861937046051025, -2.3341472148895264, -0.5597308874130249], + [1.8299254179000854, -2.770848274230957, 2.1958212852478027], + ], + [ + [2.9348952770233154, 4.117504119873047, -6.278541088104248], + [0.2638452351093292, 3.998856782913208, 2.612290620803833], + [-1.9891828298568726, -1.6476304531097412, 3.39066219329834], + ], + [ + [-8.44466781616211, 0.5747121572494507, -8.501373291015625], + [-0.036642804741859436, -0.23458999395370483, -2.370849370956421], + [2.8372013568878174, -2.987276077270508, 1.8382092714309692], + ], + ] + ] +) + +to_device = flow.device("cuda") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) +x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) +conv.bias = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) +conv.to(to_device) +of_out = conv(x) +if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): + print("conv2d_bias Passed") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) +x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) +conv.to(to_device) +of_out = conv(x) +of_out.sum().backward() +if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_data_grad Passed") + +if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_weight_grad Passed") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=True).to(to_device) +x = flow.tensor(test_conv2d_with_bias_data, dtype=flow.float32, device=to_device) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_weight)) +conv.bias 
= flow.nn.Parameter(flow.Tensor(test_conv2d_with_bias_bias)) +conv.to(to_device) +of_out = conv(x) +if np.allclose(of_out.numpy(), test_conv2d_with_bias_output, rtol=1e-4, atol=1e-8): + print("conv2d_bias Passed") + +conv = flow.nn.Conv2d(1, 3, (3, 3), bias=False).to(flow.device("cuda")) +x = flow.tensor(test_conv2d_data, dtype=flow.float32, device=to_device, requires_grad=True) +conv.weight = flow.nn.Parameter(flow.Tensor(test_conv2d_weight), requires_grad=True) +conv.to(to_device) +of_out = conv(x) +of_out.sum().backward() +if np.allclose(x.grad.numpy(), test_conv2d_data_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_data_grad Passed") + +if np.allclose(conv.weight.grad.numpy(), test_conv2d_weight_grad, rtol=1e-4, atol=1e-8): + print("con2d_back_weight_grad Passed") + + + + + + diff --git a/python/oneflow/test/modules/test_softmax_cross_entropy b/python/oneflow/test/modules/test_softmax_cross_entropy index 017d01a..a30b1cc 100644 --- a/python/oneflow/test/modules/test_softmax_cross_entropy +++ b/python/oneflow/test/modules/test_softmax_cross_entropy @@ -1,174 +1,174 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import unittest -import oneflow as flow -import oneflow.unittest -import numpy as np -import torch - -class TestSoftmaxCrossEntropyError(flow.unittest.TestCase): - def test_softmax_cross_entropy_prediction_numaxes_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - prediction = flow.randn(10) - label = flow.randn(1, 10) - flow._C.softmax_cross_entropy(prediction, label) - test_case.assertTrue( - "The dimension of prediction must be greater than or equal to 2, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_prediction_shape_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - prediction = flow.randn(1, 10) - label = flow.randn(1, 11) - flow._C.softmax_cross_entropy(prediction, label) - test_case.assertTrue( - "must match the size of prediction" in str(context.exception) - ) - - def test_softmax_cross_entropy_dtype_err(test_case): - with test_case.assertRaises(TypeError) as context: - prediction = flow.randn(1, 10, dtype=flow.float32) - label = flow.randn(1, 10, dtype=flow.float64) - flow._C.softmax_cross_entropy(prediction, label) - test_case.assertTrue( - "label and prediction are expected to have the same dtype, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_prob_numaxes_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 5) - label = flow.randn(10, 10, 5) - prob = flow.randn(10) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue( - "The dimension of prob must be greater than or equal to 2, but found " - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_dy_numaxes_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 10, 5) - label = flow.randn(10, 10, 5) - prob = flow.randn(10, 10, 5) - 
flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue( - "The dimension of dy is expected to be less than that of prob by 1, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_dy_i_shape_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 8) - label = flow.randn(10, 10, 5) - prob = flow.randn(10, 10, 5) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue("must match the size of label" in str(context.exception)) - - def test_softmax_cross_entropy_grad_prob_shape_err(test_case): - with test_case.assertRaises(RuntimeError) as context: - dy = flow.randn(10, 10) - label = flow.randn(10, 10, 5) - prob = flow.randn(10, 10, 6) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue("must match the size of prob" in str(context.exception)) - - def test_softmax_cross_entropy_grad_label_dtype_err(test_case): - with test_case.assertRaises(TypeError) as context: - dy = flow.randn(10, 10, dtype=flow.float64) - label = flow.randn(10, 10, 5, dtype=flow.float32) - prob = flow.randn(10, 10, 5, dtype=flow.float64) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - test_case.assertTrue( - "label and prob are expected to have the same dtype, but found" - in str(context.exception) - ) - - def test_softmax_cross_entropy_grad_dy_dtype_err(test_case): - with test_case.assertRaises(TypeError) as context: - dy = flow.randn(10, 10, dtype=flow.float32) - label = flow.randn(10, 10, 5, dtype=flow.float64) - prob = flow.randn(10, 10, 5, dtype=flow.float64) - flow._C.softmax_cross_entropy_grad(dy, label, prob) - print(str(context.exception)) - test_case.assertTrue( - "dy and prob are expected to have the same dtype, but found" - in str(context.exception) - ) - - -if __name__ == "__main__": - - np_prediction = np.random.random((1, 10)).astype(np.float32) - np_label = np.random.random((1, 10)).astype(np.float32) - - of_prediction = flow.tensor( - np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) - of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) - of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") - of_output.sum() - - torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) - torch_label = torch.tensor(np_label, dtype=torch.float32) - torch_output = torch.nn.functional.cross_entropy( - torch_prediction, torch_label, reduction="none") - torch_output.sum() - - if np.allclose( - of_output.numpy(), torch_output.detach().numpy(), rtol=1e-03, atol=1e-04 - ): - print("test_softmax_cross_entropy Passed") - else: - print("test_softmax_cross_entropy Failed") - - np_prediction = np.random.random((1, 10, 2)).astype(np.float32) - np_label = np.random.random((1, 10, 2)).astype(np.float32) - - of_prediction = flow.tensor( - np_prediction, device=flow.device("cpu"), dtype=flow.float32, requires_grad=True) - of_label = flow.tensor(np_label, device=flow.device("cpu"), dtype=flow.float32) - of_output = flow._C.softmax_cross_entropy(of_prediction, of_label) - of_output.sum().backward() - print("of cpu res:") - print(of_prediction.grad.numpy()) - - of_prediction = flow.tensor( - np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) - of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) - of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") - of_output.sum().backward() - print("of gpu res:") - 
print(of_prediction.grad.numpy()) - - torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) - torch_label = torch.tensor(np_label, dtype=torch.float32) - torch_output = torch.nn.functional.cross_entropy( - torch_prediction, torch_label, reduction="none") - torch_output.sum().backward() - - print("*************************") - print(torch_prediction.grad) - # if np.allclose( - # of_prediction.grad.numpy(), torch_prediction.grad, rtol=1e-03, atol=1e-04 - # ): - # print("test_softmax_cross_entropy_grad Passed") - # else: - # print("test_softmax_cross_entropy_grad Failed") - - - - - - +""" +Copyright 2020 The OneFlow Authors. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import unittest +import oneflow as flow +import oneflow.unittest +import numpy as np +import torch + +class TestSoftmaxCrossEntropyError(flow.unittest.TestCase): + def test_softmax_cross_entropy_prediction_numaxes_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + prediction = flow.randn(10) + label = flow.randn(1, 10) + flow._C.softmax_cross_entropy(prediction, label) + test_case.assertTrue( + "The dimension of prediction must be greater than or equal to 2, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_prediction_shape_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + prediction = flow.randn(1, 10) + label = flow.randn(1, 11) + flow._C.softmax_cross_entropy(prediction, label) + test_case.assertTrue( + "must match the size of prediction" in str(context.exception) + ) + + def test_softmax_cross_entropy_dtype_err(test_case): + with test_case.assertRaises(TypeError) as context: + prediction = flow.randn(1, 10, dtype=flow.float32) + label = flow.randn(1, 10, dtype=flow.float64) + flow._C.softmax_cross_entropy(prediction, label) + test_case.assertTrue( + "label and prediction are expected to have the same dtype, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_prob_numaxes_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 5) + label = flow.randn(10, 10, 5) + prob = flow.randn(10) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue( + "The dimension of prob must be greater than or equal to 2, but found " + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_dy_numaxes_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 10, 5) + label = flow.randn(10, 10, 5) + prob = flow.randn(10, 10, 5) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue( + "The dimension of dy is expected to be less than that of prob by 1, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_dy_i_shape_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 8) + label = flow.randn(10, 10, 5) + prob = flow.randn(10, 10, 5) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + 
test_case.assertTrue("must match the size of label" in str(context.exception)) + + def test_softmax_cross_entropy_grad_prob_shape_err(test_case): + with test_case.assertRaises(RuntimeError) as context: + dy = flow.randn(10, 10) + label = flow.randn(10, 10, 5) + prob = flow.randn(10, 10, 6) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue("must match the size of prob" in str(context.exception)) + + def test_softmax_cross_entropy_grad_label_dtype_err(test_case): + with test_case.assertRaises(TypeError) as context: + dy = flow.randn(10, 10, dtype=flow.float64) + label = flow.randn(10, 10, 5, dtype=flow.float32) + prob = flow.randn(10, 10, 5, dtype=flow.float64) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + test_case.assertTrue( + "label and prob are expected to have the same dtype, but found" + in str(context.exception) + ) + + def test_softmax_cross_entropy_grad_dy_dtype_err(test_case): + with test_case.assertRaises(TypeError) as context: + dy = flow.randn(10, 10, dtype=flow.float32) + label = flow.randn(10, 10, 5, dtype=flow.float64) + prob = flow.randn(10, 10, 5, dtype=flow.float64) + flow._C.softmax_cross_entropy_grad(dy, label, prob) + print(str(context.exception)) + test_case.assertTrue( + "dy and prob are expected to have the same dtype, but found" + in str(context.exception) + ) + + +if __name__ == "__main__": + + np_prediction = np.random.random((1, 10)).astype(np.float32) + np_label = np.random.random((1, 10)).astype(np.float32) + + of_prediction = flow.tensor( + np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) + of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) + of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") + of_output.sum() + + torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) + torch_label = torch.tensor(np_label, dtype=torch.float32) + torch_output = torch.nn.functional.cross_entropy( + torch_prediction, torch_label, reduction="none") + torch_output.sum() + + if np.allclose( + of_output.numpy(), torch_output.detach().numpy(), rtol=1e-03, atol=1e-04 + ): + print("test_softmax_cross_entropy Passed") + else: + print("test_softmax_cross_entropy Failed") + + np_prediction = np.random.random((1, 10, 2)).astype(np.float32) + np_label = np.random.random((1, 10, 2)).astype(np.float32) + + of_prediction = flow.tensor( + np_prediction, device=flow.device("cpu"), dtype=flow.float32, requires_grad=True) + of_label = flow.tensor(np_label, device=flow.device("cpu"), dtype=flow.float32) + of_output = flow._C.softmax_cross_entropy(of_prediction, of_label) + of_output.sum().backward() + print("of cpu res:") + print(of_prediction.grad.numpy()) + + of_prediction = flow.tensor( + np_prediction, device=flow.device("cuda"), dtype=flow.float32, requires_grad=True) + of_label = flow.tensor(np_label, device=flow.device("cuda"), dtype=flow.float32) + of_output = flow._C.softmax_cross_entropy(of_prediction, of_label).to("cuda") + of_output.sum().backward() + print("of gpu res:") + print(of_prediction.grad.numpy()) + + torch_prediction = torch.tensor(np_prediction, dtype=torch.float32, requires_grad=True) + torch_label = torch.tensor(np_label, dtype=torch.float32) + torch_output = torch.nn.functional.cross_entropy( + torch_prediction, torch_label, reduction="none") + torch_output.sum().backward() + + print("*************************") + print(torch_prediction.grad) + # if np.allclose( + # of_prediction.grad.numpy(), 
torch_prediction.grad, rtol=1e-03, atol=1e-04 + # ): + # print("test_softmax_cross_entropy_grad Passed") + # else: + # print("test_softmax_cross_entropy_grad Failed") + + + + + + diff --git a/python/oneflow/test/profiler/test_profile_lenet.py b/python/oneflow/test/profiler/test_profile_lenet.py index 07d3a3c..d573277 100644 --- a/python/oneflow/test/profiler/test_profile_lenet.py +++ b/python/oneflow/test/profiler/test_profile_lenet.py @@ -1,148 +1,148 @@ -""" -Copyright 2020 The OneFlow Authors. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -import os -import unittest -import oneflow.unittest -import oneflow as flow -import oneflow.nn as nn -import oneflow.nn.functional as F -import oneflow.profiler -from oneflow.profiler.events import CustomEvent, KernelEvent - - -class LeNet(nn.Module): - def __init__(self): - super(LeNet, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16 * 5 * 5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - out = F.relu(self.conv1(x)) - out = F.max_pool2d(out, 2) - out = F.relu(self.conv2(out)) - out = F.max_pool2d(out, 2) - out = out.view(out.size(0), -1) - out = F.relu(self.fc1(out)) - out = F.relu(self.fc2(out)) - out = self.fc3(out) - return out - - -def get_event(events, name: str, input_shapes: str = "-"): - for item in events: - if isinstance(item, CustomEvent): - if item.name == name: - return item - if isinstance(item, KernelEvent): - if item.name == name and item.input_shapes == input_shapes: - return item - return None - - -def _test_lenet( - test_case, - on_cuda: bool, - record_shapes: bool, - record_bandwidth_for_cuda: bool = False, -): - x = flow.randn(2, 3, 32, 32) - lenet = LeNet() - if on_cuda: - x = x.to("cuda") - lenet.to("cuda") - activities = [oneflow.profiler.ProfilerActivity.CPU] - if on_cuda: - activities.append(oneflow.profiler.ProfilerActivity.CUDA) - with oneflow.profiler.profile( - activities=activities, - record_shapes=record_shapes, - record_bandwidth_for_cuda=record_bandwidth_for_cuda, - ) as prof: - with oneflow.profiler.record_function("lenet_forward_total_time") as f: - for _ in range(2): - eager_res = lenet(x) - with oneflow.profiler.record_function("lenet_backward_total_time") as f: - eager_res.sum().backward() - events = prof.key_averages(group_by_input_shape=True) - print(events) - conv_event = get_event( - events, "conv2d", "[(2,3,32,32), (6,3,5,5)]" if record_shapes else "-" - ) - test_case.assertIsNotNone(conv_event) - - if on_cuda: - test_case.assertGreater(conv_event.cpu_time, 0.0) - test_case.assertGreater(conv_event.cpu_time_total, 0.0) - test_case.assertGreater(conv_event.cuda_time, 0.0) - test_case.assertGreater(conv_event.cuda_time_total, 0.0) - else: - test_case.assertGreater(conv_event.cpu_time, 0.0) - test_case.assertGreater(conv_event.cpu_time_total, 0.0) - - test_case.assertEqual(conv_event.count, 2 if record_shapes else 4) - if record_bandwidth_for_cuda and on_cuda: - 
test_case.assertNotEqual(conv_event.bandwidth, -1) - - relu_grad_event = get_event( - events, "relu_grad", "[(2,6,28,28), (2,6,28,28)]" if record_shapes else "-" - ) - test_case.assertIsNotNone(relu_grad_event) - if on_cuda: - test_case.assertGreater(relu_grad_event.cpu_time, 0.0) - test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) - test_case.assertGreater(relu_grad_event.cuda_time, 0.0) - test_case.assertGreater(relu_grad_event.cuda_time_total, 0.0) - else: - test_case.assertGreater(relu_grad_event.cpu_time, 0.0) - test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) - - test_case.assertEqual(relu_grad_event.count, 1 if record_shapes else 4) - if record_bandwidth_for_cuda and on_cuda: - test_case.assertNotEqual(relu_grad_event.bandwidth, -1) - - test_case.assertIsNotNone(get_event(events, "lenet_forward_total_time")) - test_case.assertIsNotNone(get_event(events, "lenet_backward_total_time")) - - -class TestProfileLenet(flow.unittest.TestCase): - def test_lenet_cpu(test_case): - _test_lenet(test_case, on_cuda=False, record_shapes=True) - _test_lenet(test_case, on_cuda=False, record_shapes=False) - - @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") - def test_lenet_cuda(test_case): - _test_lenet( - test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=False - ) - _test_lenet( - test_case, - on_cuda=True, - record_shapes=False, - record_bandwidth_for_cuda=False, - ) - _test_lenet( - test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=True - ) - _test_lenet( - test_case, on_cuda=True, record_shapes=False, record_bandwidth_for_cuda=True - ) - - -if __name__ == "__main__": - unittest.main() +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import os +import unittest +import oneflow.unittest +import oneflow as flow +import oneflow.nn as nn +import oneflow.nn.functional as F +import oneflow.profiler +from oneflow.profiler.events import CustomEvent, KernelEvent + + +class LeNet(nn.Module): + def __init__(self): + super(LeNet, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + out = F.relu(self.conv1(x)) + out = F.max_pool2d(out, 2) + out = F.relu(self.conv2(out)) + out = F.max_pool2d(out, 2) + out = out.view(out.size(0), -1) + out = F.relu(self.fc1(out)) + out = F.relu(self.fc2(out)) + out = self.fc3(out) + return out + + +def get_event(events, name: str, input_shapes: str = "-"): + for item in events: + if isinstance(item, CustomEvent): + if item.name == name: + return item + if isinstance(item, KernelEvent): + if item.name == name and item.input_shapes == input_shapes: + return item + return None + + +def _test_lenet( + test_case, + on_cuda: bool, + record_shapes: bool, + record_bandwidth_for_cuda: bool = False, +): + x = flow.randn(2, 3, 32, 32) + lenet = LeNet() + if on_cuda: + x = x.to("cuda") + lenet.to("cuda") + activities = [oneflow.profiler.ProfilerActivity.CPU] + if on_cuda: + activities.append(oneflow.profiler.ProfilerActivity.CUDA) + with oneflow.profiler.profile( + activities=activities, + record_shapes=record_shapes, + record_bandwidth_for_cuda=record_bandwidth_for_cuda, + ) as prof: + with oneflow.profiler.record_function("lenet_forward_total_time") as f: + for _ in range(2): + eager_res = lenet(x) + with oneflow.profiler.record_function("lenet_backward_total_time") as f: + eager_res.sum().backward() + events = prof.key_averages(group_by_input_shape=True) + print(events) + conv_event = get_event( + events, "conv2d", "[(2,3,32,32), (6,3,5,5)]" if record_shapes else "-" + ) + test_case.assertIsNotNone(conv_event) + + if on_cuda: + test_case.assertGreater(conv_event.cpu_time, 0.0) + test_case.assertGreater(conv_event.cpu_time_total, 0.0) + test_case.assertGreater(conv_event.cuda_time, 0.0) + test_case.assertGreater(conv_event.cuda_time_total, 0.0) + else: + test_case.assertGreater(conv_event.cpu_time, 0.0) + test_case.assertGreater(conv_event.cpu_time_total, 0.0) + + test_case.assertEqual(conv_event.count, 2 if record_shapes else 4) + if record_bandwidth_for_cuda and on_cuda: + test_case.assertNotEqual(conv_event.bandwidth, -1) + + relu_grad_event = get_event( + events, "relu_grad", "[(2,6,28,28), (2,6,28,28)]" if record_shapes else "-" + ) + test_case.assertIsNotNone(relu_grad_event) + if on_cuda: + test_case.assertGreater(relu_grad_event.cpu_time, 0.0) + test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) + test_case.assertGreater(relu_grad_event.cuda_time, 0.0) + test_case.assertGreater(relu_grad_event.cuda_time_total, 0.0) + else: + test_case.assertGreater(relu_grad_event.cpu_time, 0.0) + test_case.assertGreater(relu_grad_event.cpu_time_total, 0.0) + + test_case.assertEqual(relu_grad_event.count, 1 if record_shapes else 4) + if record_bandwidth_for_cuda and on_cuda: + test_case.assertNotEqual(relu_grad_event.bandwidth, -1) + + test_case.assertIsNotNone(get_event(events, "lenet_forward_total_time")) + test_case.assertIsNotNone(get_event(events, "lenet_backward_total_time")) + + +class TestProfileLenet(flow.unittest.TestCase): + def test_lenet_cpu(test_case): + _test_lenet(test_case, on_cuda=False, record_shapes=True) + 
_test_lenet(test_case, on_cuda=False, record_shapes=False) + + @unittest.skipIf(os.getenv("ONEFLOW_TEST_CPU_ONLY"), "only test cpu cases") + def test_lenet_cuda(test_case): + _test_lenet( + test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=False + ) + _test_lenet( + test_case, + on_cuda=True, + record_shapes=False, + record_bandwidth_for_cuda=False, + ) + _test_lenet( + test_case, on_cuda=True, record_shapes=True, record_bandwidth_for_cuda=True + ) + _test_lenet( + test_case, on_cuda=True, record_shapes=False, record_bandwidth_for_cuda=True + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/version_script.lds b/version_script.lds index 665a2f9..4737392 100644 --- a/version_script.lds +++ b/version_script.lds @@ -1,7 +1,7 @@ -{ - global: - *; - local: - *llvm*; -}; - +{ + global: + *; + local: + *llvm*; +}; + -- GitLab