From deb763b70bc9c93a1e41977d3e64dbe0f2c40b01 Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 6 Feb 2026 14:07:36 +0800
Subject: [PATCH] Clone code from GitHub

---
 CITATION.bib | 7 +
 CODE_OF_CONDUCT.md | 7 +
 LICENSE | 20 +
 MANIFEST.in | 18 +
 codecov.yml | 13 +
 cupy/__init__.py | 920 +++
 cupy/_binary/__init__.py | 2 +
 cupy/_binary/elementwise.py | 22 +
 cupy/_binary/packing.py | 104 +
 cupy/_core/__init__.pxd | 0
 cupy/_core/__init__.py | 79 +
 cupy/_core/_accelerator.pxd | 10 +
 cupy/_core/_accelerator.pyx | 59 +
 cupy/_core/_carray.pxd | 55 +
 cupy/_core/_carray.pyx | 57 +
 cupy/_core/_codeblock.py | 38 +
 cupy/_core/_cub_reduction.pxd | 12 +
 cupy/_core/_cub_reduction.pyx | 712 +++
 cupy/_core/_dtype.pxd | 10 +
 cupy/_core/_dtype.pyx | 126 +
 cupy/_core/_fusion_interface.py | 272 +
 cupy/_core/_fusion_kernel.pyx | 364 ++
 cupy/_core/_fusion_op.py | 316 +
 cupy/_core/_fusion_optimization.py | 90 +
 cupy/_core/_fusion_thread_local.pyx | 46 +
 cupy/_core/_fusion_trace.pyx | 616 ++
 cupy/_core/_fusion_variable.pxd | 5 +
 cupy/_core/_fusion_variable.pyx | 340 ++
 cupy/_core/_gufuncs.py | 729 +++
 cupy/_core/_kernel.pxd | 170 +
 cupy/_core/_kernel.pyx | 1622 ++++++
 cupy/_core/_memory_range.pxd | 7 +
 cupy/_core/_memory_range.pyx | 40 +
 cupy/_core/_optimize_config.pxd | 22 +
 cupy/_core/_optimize_config.pyx | 81 +
 cupy/_core/_reduction.pxd | 77 +
 cupy/_core/_reduction.pyx | 906 +++
 cupy/_core/_routines_binary.pxd | 6 +
 cupy/_core/_routines_binary.pyx | 96 +
 cupy/_core/_routines_indexing.pxd | 15 +
 cupy/_core/_routines_indexing.pyx | 1161 ++++
 cupy/_core/_routines_linalg.pxd | 27 +
 cupy/_core/_routines_linalg.pyx | 1067 ++++
 cupy/_core/_routines_logic.pxd | 11 +
 cupy/_core/_routines_logic.pyx | 141 +
 cupy/_core/_routines_manipulation.pxd | 40 +
 cupy/_core/_routines_manipulation.pyx | 885 +++
 cupy/_core/_routines_math.pxd | 39 +
 cupy/_core/_routines_math.pyx | 1145 ++++
 cupy/_core/_routines_sorting.pxd | 7 +
 cupy/_core/_routines_sorting.pyx | 534 ++
 cupy/_core/_routines_statistics.pxd | 30 +
 cupy/_core/_routines_statistics.pyx | 763 +++
 cupy/_core/_scalar.pxd | 37 +
 cupy/_core/_scalar.pyx | 386 ++
 cupy/_core/_ufuncs.py | 9 +
 cupy/_core/core.pxd | 115 +
 cupy/_core/core.pyx | 2813 +++
 cupy/_core/dlpack.pxd | 12 +
 cupy/_core/dlpack.pyx | 412 ++
 cupy/_core/flags.pyx | 34 +
 cupy/_core/fusion.pyx | 1004 ++++
 cupy/_core/halffloat.h | 125 +
 cupy/_core/include/cupy/README.md | 22 +
 cupy/_core/include/cupy/_cuda/README.md | 9 +
 .../include/cupy/_cuda/cuda-10.2/cuda_fp16.h | 3052 ++++
 .../cupy/_cuda/cuda-10.2/cuda_fp16.hpp | 2071 +++++++
 .../include/cupy/_cuda/cuda-11.0/cuda_fp16.h | 3612 ++++++++++++
 .../cupy/_cuda/cuda-11.0/cuda_fp16.hpp | 2285 ++++++++
 .../include/cupy/_cuda/cuda-11.1/cuda_fp16.h | 3631 ++++++++++++
 .../cupy/_cuda/cuda-11.1/cuda_fp16.hpp | 2453 ++++
 .../include/cupy/_cuda/cuda-11/cuda_fp16.h | 3794 ++++++++++++
 .../include/cupy/_cuda/cuda-11/cuda_fp16.hpp | 2614 +++++++
 .../include/cupy/_cuda/cuda-12/cuda_fp16.h | 4023 +++++++++++++
 .../include/cupy/_cuda/cuda-12/cuda_fp16.hpp | 2738 +++++
 cupy/_core/include/cupy/atomics.cuh | 114 +
 cupy/_core/include/cupy/carray.cuh | 849 +++
 cupy/_core/include/cupy/complex.cuh | 100 +
 cupy/_core/include/cupy/complex/README.md | 3 +
 cupy/_core/include/cupy/complex/arithmetic.h | 314 +
 cupy/_core/include/cupy/complex/catrig.h | 730 +++
 cupy/_core/include/cupy/complex/catrigf.h | 444 ++
 cupy/_core/include/cupy/complex/ccosh.h | 205 +
 cupy/_core/include/cupy/complex/ccoshf.h | 135 +
 cupy/_core/include/cupy/complex/cexp.h | 173 +
 cupy/_core/include/cupy/complex/cexpf.h | 153 +
 cupy/_core/include/cupy/complex/clog.h | 203 +
 cupy/_core/include/cupy/complex/clogf.h | 192 +
 cupy/_core/include/cupy/complex/complex.h | 674 +++
 cupy/_core/include/cupy/complex/complex_inl.h | 164 +
 cupy/_core/include/cupy/complex/cpow.h | 44 +
 cupy/_core/include/cupy/complex/cproj.h | 64 +
 cupy/_core/include/cupy/complex/csinh.h | 192 +
 cupy/_core/include/cupy/complex/csinhf.h | 133 +
 cupy/_core/include/cupy/complex/csqrt.h | 144 +
 cupy/_core/include/cupy/complex/csqrtf.h | 141 +
 cupy/_core/include/cupy/complex/ctanh.h | 191 +
 cupy/_core/include/cupy/complex/ctanhf.h | 116 +
 .../_core/include/cupy/complex/math_private.h | 192 +
 cupy/_core/include/cupy/cuComplex_bridge.h | 34 +
 cupy/_core/include/cupy/cub/.gitattributes | 1 +
 cupy/_core/include/cupy/cub/LICENSE.TXT | 1 +
 cupy/_core/include/cupy/cub/cub | 1 +
 cupy/_core/include/cupy/cuda_workaround.h | 13 +
 cupy/_core/include/cupy/dlpack/README.md | 4 +
 cupy/_core/include/cupy/dlpack/dlpack.h | 232 +
 cupy/_core/include/cupy/hip_workaround.cuh | 20 +
 cupy/_core/include/cupy/jitify/.clang-format | 149 +
 cupy/_core/include/cupy/jitify/.gitignore | 10 +
 cupy/_core/include/cupy/jitify/Doxyfile | 2427 ++++
 cupy/_core/include/cupy/jitify/LICENSE | 29 +
 cupy/_core/include/cupy/jitify/Makefile | 105 +
 cupy/_core/include/cupy/jitify/README.md | 116 +
 .../example_headers/class_arg_kernel.cuh | 69 +
 .../example_headers/constant_header.cuh | 43 +
 .../jitify/example_headers/my_header1.cuh | 34 +
 .../jitify/example_headers/my_header2.cuh | 34 +
 .../jitify/example_headers/my_header3.cuh | 34 +
 cupy/_core/include/cupy/jitify/jitify.hpp | 4454 ++++++++++++++
 .../include/cupy/jitify/jitify_example.cpp | 359 ++
 cupy/_core/include/cupy/jitify/jitify_test.cu | 1081 ++++
 cupy/_core/include/cupy/jitify/stringify.cpp | 86 +
 cupy/_core/include/cupy/math_constants.h | 20 +
 cupy/_core/include/cupy/pair.cuh | 279 +
 cupy/_core/include/cupy/swap.cuh | 37 +
 cupy/_core/include/cupy/tuple.cuh | 568 ++
 cupy/_core/include/cupy/tuple/pair.h | 228 +
 cupy/_core/include/cupy/tuple/tuple.h | 960 +++
 cupy/_core/include/cupy/tuple/type_traits.h | 70 +
 cupy/_core/include/cupy/type_dispatcher.cuh | 63 +
 cupy/_core/internal.pxd | 66 +
 cupy/_core/internal.pyx | 536 ++
 cupy/_core/new_fusion.pyx | 176 +
 cupy/_core/raw.pxd | 30 +
 cupy/_core/raw.pyx | 545 ++
 cupy/_core/syncdetect.py | 68 +
 cupy/_creation/__init__.py | 2 +
 cupy/_creation/basic.py | 314 +
 cupy/_creation/from_data.py | 226 +
 cupy/_creation/matrix.py | 178 +
 cupy/_creation/ranges.py | 442 ++
 cupy/_environment.py | 529 ++
 cupy/_functional/__init__.py | 0
 cupy/_functional/piecewise.py | 61 +
 cupy/_functional/vectorize.py | 104 +
 cupy/_indexing/__init__.py | 2 +
 cupy/_indexing/generate.py | 588 ++
 cupy/_indexing/indexing.py | 223 +
 cupy/_indexing/insert.py | 260 +
 cupy/_indexing/iterate.py | 155 +
 cupy/_io/__init__.py | 2 +
 cupy/_io/formatting.py | 75 +
 cupy/_io/npz.py | 146 +
 cupy/_io/text.py | 13 +
 cupy/_logic/__init__.py | 2 +
 cupy/_logic/comparison.py | 174 +
 cupy/_logic/content.py | 131 +
 cupy/_logic/ops.py | 43 +
 cupy/_logic/truth.py | 316 +
 cupy/_logic/type_testing.py | 183 +
 cupy/_manipulation/__init__.py | 2 +
 cupy/_manipulation/add_remove.py | 213 +
 cupy/_manipulation/basic.py | 114 +
 cupy/_manipulation/dims.py | 167 +
 cupy/_manipulation/join.py | 135 +
 cupy/_manipulation/kind.py | 122 +
 cupy/_manipulation/rearrange.py | 200 +
 cupy/_manipulation/shape.py | 89 +
 cupy/_manipulation/split.py | 91 +
 cupy/_manipulation/tiling.py | 70 +
 cupy/_manipulation/transpose.py | 82 +
 cupy/_math/__init__.py | 2 +
 cupy/_math/arithmetic.py | 163 +
 cupy/_math/explog.py | 103 +
 cupy/_math/floating.py | 62 +
 cupy/_math/hyperbolic.py | 55 +
 cupy/_math/misc.py | 569 ++
 cupy/_math/rational.py | 62 +
 cupy/_math/rounding.py | 77 +
 cupy/_math/special.py | 26 +
 cupy/_math/sumprod.py | 626 ++
 cupy/_math/trigonometric.py | 163 +
 cupy/_math/ufunc.py | 17 +
 cupy/_math/window.py | 197 +
 cupy/_misc/__init__.py | 0
 cupy/_misc/byte_bounds.py | 23 +
 cupy/_misc/memory_ranges.py | 44 +
 cupy/_misc/who.py | 106 +
 cupy/_padding/__init__.py | 2 +
 cupy/_padding/pad.py | 752 +++
 cupy/_sorting/__init__.py | 2 +
 cupy/_sorting/count.py | 31 +
 cupy/_sorting/search.py | 466 ++
 cupy/_sorting/sort.py | 217 +
 cupy/_statistics/__init__.py | 2 +
 cupy/_statistics/correlation.py | 212 +
 cupy/_statistics/histogram.py | 565 ++
 cupy/_statistics/meanvar.py | 288 +
 cupy/_statistics/order.py | 401 ++
 cupy/_util.pyi | 0
 cupy/_util.pyx | 218 +
 cupy/_version.py | 1 +
 cupy/array_api/__init__.py | 388 ++
 cupy/array_api/_array_object.py | 1138 ++++
 cupy/array_api/_constants.py | 6 +
 cupy/array_api/_creation_functions.py | 448 ++
 cupy/array_api/_data_type_functions.py | 150 +
 cupy/array_api/_dtypes.py | 143 +
 cupy/array_api/_elementwise_functions.py | 729 +++
 cupy/array_api/_indexing_functions.py | 17 +
 cupy/array_api/_manipulation_functions.py | 98 +
 cupy/array_api/_searching_functions.py | 47 +
 cupy/array_api/_set_functions.py | 106 +
 cupy/array_api/_sorting_functions.py | 53 +
 cupy/array_api/_statistical_functions.py | 115 +
 cupy/array_api/_typing.py | 75 +
 cupy/array_api/_utility_functions.py | 37 +
 cupy/array_api/linalg.py | 445 ++
 cupy/cublas.py | 1012 ++++
 cupy/cuda/__init__.pxd | 0
 cupy/cuda/__init__.py | 184 +
 cupy/cuda/common.pxd | 22 +
 cupy/cuda/common.pyx | 60 +
 cupy/cuda/compiler.py | 943 +++
 cupy/cuda/cub.pxd | 20 +
 cupy/cuda/cub.pyx | 585 ++
 cupy/cuda/cudnn.py | 17 +
 cupy/cuda/cufft.pxd | 99 +
 cupy/cuda/cufft.pyx | 1205 ++++
 cupy/cuda/cupy_cub.cu | 1030 ++++
 cupy/cuda/cupy_cub.h | 97 +
 cupy/cuda/cupy_cufft.h | 324 ++
 cupy/cuda/cupy_cufftXt.cu | 68 +
 cupy/cuda/cupy_cufftXt.h | 10 +
 cupy/cuda/cupy_jitify.h | 41 +
 cupy/cuda/cupy_thrust.cu | 526 ++
 cupy/cuda/cupy_thrust.h | 26 +
 cupy/cuda/cutensor.py | 14 +
 cupy/cuda/device.pxd | 23 +
 cupy/cuda/device.pyx | 368 ++
 cupy/cuda/function.pxd | 39 +
 cupy/cuda/function.pyx | 302 +
 cupy/cuda/graph.pxd | 15 +
 cupy/cuda/graph.pyx | 80 +
 cupy/cuda/jitify.pyx | 108 +
 cupy/cuda/memory.pxd | 95 +
 cupy/cuda/memory.pyx | 1982 +++++++
 cupy/cuda/memory_hook.pxd | 2 +
 cupy/cuda/memory_hook.pyx | 178 +
 cupy/cuda/memory_hooks/__init__.py | 6 +
 cupy/cuda/memory_hooks/debug_print.py | 78 +
 cupy/cuda/memory_hooks/line_profile.py | 171 +
 cupy/cuda/nccl.py | 14 +
 cupy/cuda/nvtx.py | 1 +
 cupy/cuda/pinned_memory.pxd | 38 +
 cupy/cuda/pinned_memory.pyx | 352 ++
 cupy/cuda/profiler.py | 1 +
 cupy/cuda/runtime.py | 1 +
 cupy/cuda/stream.pxd | 5 +
 cupy/cuda/stream.pyx | 524 ++
 cupy/cuda/texture.pxd | 47 +
 cupy/cuda/texture.pyx | 561 ++
 cupy/cuda/thrust.pyx | 134 +
 cupy/cudnn.py | 8 +
 cupy/cusolver.py | 8 +
 cupy/cusparse.py | 8 +
 cupy/cutensor.py | 8 +
 cupy/fft/__init__.pxd | 0
 cupy/fft/__init__.py | 19 +
 cupy/fft/_cache.pyx | 682 +++
 cupy/fft/_callback.pyx | 605 ++
 cupy/fft/_fft.py | 1103 ++++
 cupy/fft/config.py | 61 +
 cupy/lib/__init__.py | 1 +
 cupy/lib/_polynomial.pyx | 257 +
 cupy/lib/_routines_poly.py | 396 ++
 cupy/lib/_shape_base.py | 63 +
 cupy/lib/stride_tricks.py | 39 +
 cupy/linalg/__init__.py | 43 +
 cupy/linalg/_decomposition.py | 585 ++
 cupy/linalg/_eigenvalue.py | 190 +
 cupy/linalg/_einsum.py | 696 +++
 cupy/linalg/_einsum_cutn.py | 168 +
 cupy/linalg/_einsum_opt.py | 412 ++
 cupy/linalg/_norms.py | 304 +
 cupy/linalg/_product.py | 427 ++
 cupy/linalg/_solve.py | 407 ++
 cupy/linalg/_util.py | 195 +
 cupy/polynomial/__init__.py | 6 +
 cupy/polynomial/polynomial.py | 46 +
 cupy/polynomial/polyutils.py | 110 +
 cupy/prof/__init__.py | 2 +
 cupy/prof/_time_range.py | 78 +
 cupy/random/LICENSE | 109 +
 cupy/random/__init__.py | 101 +
 cupy/random/_bit_generator.pyx | 184 +
 cupy/random/_distributions.py | 952 +++
 cupy/random/_generator.py | 1309 +++++
 cupy/random/_generator_api.pyx | 1054 ++++
 cupy/random/_kernels.py | 1087 ++++
 cupy/random/_permutations.py | 31 +
 cupy/random/_sample.py | 239 +
 cupy/random/cupy_distributions.cu | 908 +++
 cupy/random/cupy_distributions.cuh | 103 +
 cupy/sparse/__init__.py | 22 +
 cupy/sparse/linalg/__init__.py | 17 +
 cupy/testing/__init__.py | 50 +
 cupy/testing/_array.py | 157 +
 cupy/testing/_attr.py | 43 +
 cupy/testing/_bundle.py | 47 +
 cupy/testing/_condition.py | 124 +
 cupy/testing/_helper.py | 316 +
 cupy/testing/_hypothesis.py | 137 +
 cupy/testing/_loops.py | 1247 ++++
 cupy/testing/_parameterized.py | 115 +
 cupy/testing/_pytest_impl.py | 70 +
 cupy/testing/_random.py | 142 +
 cupy/typing/__init__.py | 4 +
 cupy/typing/_generic_alias.py | 43 +
 cupy_backends/__init__.pxd | 0
 cupy_backends/__init__.py | 0
 cupy_backends/cuda/__init__.pxd | 0
 cupy_backends/cuda/__init__.py | 0
 cupy_backends/cuda/_softlink.pxd | 7 +
 cupy_backends/cuda/_softlink.pyx | 51 +
 cupy_backends/cuda/api/__init__.pxd | 0
 cupy_backends/cuda/api/__init__.py | 0
 cupy_backends/cuda/api/_driver_enum.pxd | 78 +
 cupy_backends/cuda/api/_driver_enum.pyx | 0
 cupy_backends/cuda/api/_driver_extern.pxi | 66 +
 cupy_backends/cuda/api/_driver_typedef.pxi | 30 +
 cupy_backends/cuda/api/_runtime_enum.pxd | 342 ++
 cupy_backends/cuda/api/_runtime_enum.pyx | 0
 cupy_backends/cuda/api/_runtime_extern.pxi | 163 +
 cupy_backends/cuda/api/_runtime_typedef.pxi | 449 ++
 cupy_backends/cuda/api/driver.pxd | 101 +
 cupy_backends/cuda/api/driver.pyx | 331 ++
 cupy_backends/cuda/api/runtime.pxd | 321 +
 cupy_backends/cuda/api/runtime.pyx | 1097 ++++
 cupy_backends/cuda/cupy_cublas.h | 24 +
 cupy_backends/cuda/cupy_cuda.h | 7 +
 cupy_backends/cuda/cupy_cuda_profiler_api.h | 18 +
 cupy_backends/cuda/cupy_cuda_runtime.h | 89 +
 cupy_backends/cuda/cupy_cudnn.h | 425 ++
 cupy_backends/cuda/cupy_cusolver.h | 245 +
 cupy_backends/cuda/cupy_cusparse.h | 910 +++
 cupy_backends/cuda/cupy_cutensor.h | 19 +
 cupy_backends/cuda/cupy_nccl.h | 12 +
 cupy_backends/cuda/cupy_nvrtc.h | 22 +
 cupy_backends/cuda/libs/__init__.pxd | 0
 cupy_backends/cuda/libs/__init__.py | 0
 cupy_backends/cuda/libs/cublas.pxd | 332 ++
 cupy_backends/cuda/libs/cublas.pyx | 1514 +++++
 cupy_backends/cuda/libs/cudnn.pxd | 778 +++
 cupy_backends/cuda/libs/cudnn.pyx | 2541 ++++
 cupy_backends/cuda/libs/curand.pxd | 32 +
 cupy_backends/cuda/libs/curand.pyx | 233 +
 cupy_backends/cuda/libs/cusolver.pxd | 727 +++
 cupy_backends/cuda/libs/cusolver.pyx | 3656 ++++++++++++
 cupy_backends/cuda/libs/cusparse.pxd | 220 +
 cupy_backends/cuda/libs/cusparse.pyx | 5173 +++++++++++++++++
 cupy_backends/cuda/libs/cusparselt.pxd | 39 +
 cupy_backends/cuda/libs/cusparselt.pyx | 505 ++
 cupy_backends/cuda/libs/cutensor.pxd | 288 +
 cupy_backends/cuda/libs/cutensor.pyx | 1000 ++++
 cupy_backends/cuda/libs/nccl.pxd | 25 +
 cupy_backends/cuda/libs/nccl.pyx | 494 ++
 cupy_backends/cuda/libs/nvrtc.pxd | 39 +
 cupy_backends/cuda/libs/nvrtc.pyx | 278 +
 cupy_backends/cuda/libs/nvtx.pyx | 232 +
 cupy_backends/cuda/libs/profiler.pxd | 11 +
 cupy_backends/cuda/libs/profiler.pyx | 70 +
 cupy_backends/cuda/stream.pxd | 6 +
 cupy_backends/cuda/stream.pyx | 92 +
 cupy_backends/cupy_backend.h | 17 +
 cupy_backends/cupy_backend_runtime.h | 17 +
 cupy_backends/cupy_blas.h | 17 +
 cupy_backends/cupy_complex.h | 17 +
 cupy_backends/cupy_cudnn.h | 441 ++
 cupy_backends/cupy_cusparselt.h | 18 +
 cupy_backends/cupy_cutensor.h | 20 +
 cupy_backends/cupy_lapack.h | 230 +
 cupy_backends/cupy_nccl.h | 169 +
 cupy_backends/cupy_profiler.h | 17 +
 cupy_backends/cupy_rand.h | 17 +
 cupy_backends/cupy_rtc.h | 18 +
 cupy_backends/cupy_sparse.h | 21 +
 cupy_backends/cupy_tx.h | 37 +
 cupy_backends/hip/cupy_cuComplex.h | 20 +
 cupy_backends/hip/cupy_hip.h | 138 +
 cupy_backends/hip/cupy_hip_common.h | 156 +
 cupy_backends/hip/cupy_hip_runtime.h | 521 ++
 cupy_backends/hip/cupy_hipblas.h | 998 ++++
 cupy_backends/hip/cupy_hiprand.h | 104 +
 cupy_backends/hip/cupy_hiprtc.h | 74 +
 cupy_backends/hip/cupy_hipsparse.h | 3881 +++++++++++++
 cupy_backends/hip/cupy_profiler.h | 24 +
 cupy_backends/hip/cupy_rccl.h | 7 +
 cupy_backends/hip/cupy_rocsolver.h | 2167 +++++++
 cupy_backends/hip/cupy_roctx.h | 31 +
 cupy_backends/stub/cupy_cuComplex.h | 22 +
 cupy_backends/stub/cupy_cublas.h | 430 ++
 cupy_backends/stub/cupy_cuda.h | 121 +
 cupy_backends/stub/cupy_cuda_common.h | 235 +
 cupy_backends/stub/cupy_cuda_runtime.h | 427 ++
 cupy_backends/stub/cupy_cudnn.h | 644 ++
 cupy_backends/stub/cupy_curand.h | 94 +
 cupy_backends/stub/cupy_cusolver.h | 1000 ++++
 cupy_backends/stub/cupy_cusparse.h | 1253 ++++
 cupy_backends/stub/cupy_cusparselt.h | 137 +
 cupy_backends/stub/cupy_cutensor.h | 105 +
 cupy_backends/stub/cupy_nccl.h | 119 +
 cupy_backends/stub/cupy_nvrtc.h | 69 +
 cupy_backends/stub/cupy_nvtx.h | 65 +
 cupy_backends/stub/cupy_profiler.h | 26 +
 cupyx/__init__.py | 25 +
 cupyx/_gufunc.py | 5 +
 cupyx/_pinned_array.py | 141 +
 cupyx/_rsqrt.py | 7 +
 cupyx/_runtime.py | 320 +
 cupyx/_scatter.py | 131 +
 cupyx/_texture.py | 197 +
 cupyx/_ufunc_config.py | 123 +
 cupyx/cudnn.pyx | 2613 +++++++
 cupyx/cusolver.pyx | 984 ++++
 cupyx/cusparse.py | 2092 +++++++
 cupyx/cutensor.pyx | 919 +++
 cupyx/distributed/__init__.py | 2 +
 cupyx/distributed/_array.py | 181 +
 cupyx/distributed/_comm.py | 67 +
 cupyx/distributed/_init.py | 89 +
 cupyx/distributed/_klv_utils.py | 58 +
 cupyx/distributed/_nccl_comm.py | 840 +++
 cupyx/distributed/_store.py | 153 +
 cupyx/distributed/_store_actions.py | 177 +
 cupyx/fallback_mode/__init__.py | 10 +
 cupyx/fallback_mode/fallback.py | 596 ++
 cupyx/fallback_mode/notification.py | 75 +
 cupyx/jit/__init__.py | 36 +
 cupyx/jit/_builtin_funcs.py | 497 ++
 cupyx/jit/_compile.py | 1009 ++++
 cupyx/jit/_cuda_typerules.py | 162 +
 cupyx/jit/_cuda_types.py | 334 ++
 cupyx/jit/_interface.py | 190 +
 cupyx/jit/_internal_types.py | 130 +
 cupyx/jit/cg.py | 453 ++
 cupyx/jit/cub.py | 115 +
 cupyx/jit/thrust.py | 952 +++
 cupyx/lapack.py | 343 ++
 cupyx/linalg/__init__.py | 3 +
 cupyx/linalg/_solve.py | 26 +
 cupyx/linalg/sparse/__init__.py | 3 +
 cupyx/linalg/sparse/_solve.py | 62 +
 cupyx/optimizing/__init__.py | 1 +
 cupyx/optimizing/_optimize.py | 109 +
 cupyx/profiler/__init__.py | 31 +
 cupyx/profiler/_time.py | 227 +
 cupyx/profiler/_time_range.py | 89 +
 cupyx/scipy/__init__.py | 35 +
 cupyx/scipy/_lib/__init__.py | 0
 cupyx/scipy/_lib/_util.py | 73 +
 cupyx/scipy/fft/__init__.py | 16 +
 cupyx/scipy/fft/_fft.py | 677 +++
 cupyx/scipy/fft/_fftlog.py | 225 +
 cupyx/scipy/fft/_helper.py | 51 +
 cupyx/scipy/fft/_realtransforms.py | 924 +++
 cupyx/scipy/fftpack/__init__.py | 9 +
 cupyx/scipy/fftpack/_fft.py | 493 ++
 cupyx/scipy/interpolate/__init__.py | 21 +
 cupyx/scipy/interpolate/_bspline.py | 943 +++
 cupyx/scipy/interpolate/_bspline2.py | 564 ++
 cupyx/scipy/interpolate/_cubic.py | 414 ++
 cupyx/scipy/interpolate/_interpolate.py | 1763 ++++++
 cupyx/scipy/interpolate/_polyint.py | 527 ++
 cupyx/scipy/interpolate/_rbfinterp.py | 790 +++
 cupyx/scipy/interpolate/_rgi.py | 622 ++
 cupyx/scipy/linalg/__init__.py | 16 +
 cupyx/scipy/linalg/_decomp_lu.py | 354 ++
 cupyx/scipy/linalg/_solve_triangular.py | 101 +
 cupyx/scipy/linalg/_special_matrices.py | 570 ++
 cupyx/scipy/linalg/_uarray.py | 75 +
 cupyx/scipy/ndimage/__init__.py | 71 +
 cupyx/scipy/ndimage/_filters.py | 1255 ++++
 cupyx/scipy/ndimage/_filters_core.py | 309 +
 cupyx/scipy/ndimage/_filters_generic.py | 268 +
 cupyx/scipy/ndimage/_fourier.py | 253 +
 cupyx/scipy/ndimage/_interp_kernels.py | 598 ++
 cupyx/scipy/ndimage/_interpolation.py | 779 +++
 cupyx/scipy/ndimage/_measurements.py | 1239 ++++
 cupyx/scipy/ndimage/_morphology.py | 1017 ++++
 cupyx/scipy/ndimage/_spline_kernel_weights.py | 73 +
 cupyx/scipy/ndimage/_spline_prefilter_core.py | 261 +
 cupyx/scipy/ndimage/_util.py | 162 +
 cupyx/scipy/signal/__init__.py | 13 +
 cupyx/scipy/signal/_bsplines.py | 39 +
 cupyx/scipy/signal/_signaltools.py | 577 ++
 cupyx/scipy/signal/_signaltools_core.py | 302 +
 cupyx/scipy/sparse/__init__.py | 44 +
 cupyx/scipy/sparse/_base.py | 582 ++
 cupyx/scipy/sparse/_compressed.py | 861 +++
 cupyx/scipy/sparse/_construct.py | 582 ++
 cupyx/scipy/sparse/_coo.py | 553 ++
 cupyx/scipy/sparse/_csc.py | 400 ++
 cupyx/scipy/sparse/_csr.py | 1211 ++++
 cupyx/scipy/sparse/_data.py | 398 ++
 cupyx/scipy/sparse/_dia.py | 219 +
 cupyx/scipy/sparse/_extract.py | 81 +
 cupyx/scipy/sparse/_index.py | 702 +++
 cupyx/scipy/sparse/_sputils.py | 169 +
 cupyx/scipy/sparse/_util.py | 26 +
 cupyx/scipy/sparse/csgraph/__init__.py | 4 +
 cupyx/scipy/sparse/csgraph/_traversal.py | 119 +
 cupyx/scipy/sparse/linalg/__init__.py | 22 +
 cupyx/scipy/sparse/linalg/_eigen.py | 422 ++
 cupyx/scipy/sparse/linalg/_interface.py | 578 ++
 cupyx/scipy/sparse/linalg/_iterative.py | 408 ++
 cupyx/scipy/sparse/linalg/_lobpcg.py | 674 +++
 cupyx/scipy/sparse/linalg/_norm.py | 111 +
 cupyx/scipy/sparse/linalg/_solve.py | 1028 ++++
 cupyx/scipy/spatial/__init__.py | 1 +
 cupyx/scipy/spatial/distance.py | 579 ++
 cupyx/scipy/special/__init__.py | 109 +
 cupyx/scipy/special/_basic.py | 287 +
 cupyx/scipy/special/_bessel.py | 462 ++
 cupyx/scipy/special/_beta.py | 1091 ++++
 cupyx/scipy/special/_complexstuff.py | 54 +
 cupyx/scipy/special/_convex_analysis.py | 120 +
 cupyx/scipy/special/_digamma.py | 191 +
 cupyx/scipy/special/_erf.py | 59 +
 cupyx/scipy/special/_exp1.py | 62 +
 cupyx/scipy/special/_expi.py | 60 +
 cupyx/scipy/special/_expn.py | 359 ++
 cupyx/scipy/special/_gamma.py | 220 +
 cupyx/scipy/special/_gammainc.py | 1144 ++++
 cupyx/scipy/special/_gammaln.py | 65 +
 cupyx/scipy/special/_gammasgn.py | 51 +
 cupyx/scipy/special/_loggamma.py | 229 +
 cupyx/scipy/special/_logsoftmax.py | 53 +
 cupyx/scipy/special/_logsumexp.py | 75 +
 cupyx/scipy/special/_lpmv.py | 417 ++
 cupyx/scipy/special/_poch.py | 119 +
 cupyx/scipy/special/_polygamma.py | 22 +
 cupyx/scipy/special/_softmax.py | 28 +
 cupyx/scipy/special/_sph_harm.py | 84 +
 cupyx/scipy/special/_statistics.py | 230 +
 cupyx/scipy/special/_stats_distributions.py | 1125 ++++
 cupyx/scipy/special/_trig.py | 97 +
 cupyx/scipy/special/_xlogy.py | 68 +
 cupyx/scipy/special/_zeta.py | 129 +
 cupyx/scipy/stats/__init__.py | 11 +
 cupyx/scipy/stats/_distributions.py | 59 +
 cupyx/scipy/stats/_morestats.py | 47 +
 cupyx/scipy/stats/_stats.py | 77 +
 cupyx/scipy/stats/_stats_py.py | 138 +
 cupyx/time.py | 90 +
 cupyx/tools/__init__.py | 0
 cupyx/tools/_hipsparse_stub_mapper.py | 402 ++
 cupyx/tools/install_library.py | 303 +
 docker/python3/Dockerfile | 12 +
 docker/rocm/Dockerfile | 22 +
 docker/rocm/README.md | 5 +
 docs/LICENSE_THIRD_PARTY | 70 +
 docs/Makefile | 197 +
 docs/image/cupy_logo_1000px.png | Bin 0 -> 56999 bytes
 docs/make.bat | 263 +
 docs/requirements.txt | 8 +
 docs/source/.gitignore | 1 +
 docs/source/_comparison_generator.py | 297 +
 docs/source/_static/favicon.ico | Bin 0 -> 203401 bytes
 docs/source/_templates/autosummary/class.rst | 53 +
 docs/source/conf.py | 518 ++
 docs/source/contribution.rst | 420 ++
 docs/source/index.rst | 26 +
 docs/source/install.rst | 476 ++
 docs/source/license.rst | 107 +
 docs/source/overview.rst | 68 +
 docs/source/reference/_deprecated.rst | 42 +
 docs/source/reference/_private.rst | 31 +
 docs/source/reference/array_api.rst | 26 +
 docs/source/reference/array_api_array.rst | 18 +
 docs/source/reference/array_api_functions.rst | 8 +
 docs/source/reference/binary.rst | 38 +
 docs/source/reference/comparison.rst | 9 +
 docs/source/reference/creation.rst | 70 +
 docs/source/reference/cuda.rst | 209 +
 docs/source/reference/distributed.rst | 15 +
 docs/source/reference/dtype.rst | 61 +
 docs/source/reference/environment.rst | 187 +
 docs/source/reference/ext.rst | 51 +
 docs/source/reference/fft.rst | 97 +
 docs/source/reference/fftpack.rst | 3 +
 docs/source/reference/functional.rst | 18 +
 docs/source/reference/index.rst | 33 +
 docs/source/reference/indexing.rst | 65 +
 docs/source/reference/io.rst | 51 +
 docs/source/reference/kernel.rst | 68 +
 docs/source/reference/linalg.rst | 71 +
 docs/source/reference/logic.rst | 73 +
 docs/source/reference/manipulation.rst | 134 +
 docs/source/reference/math.rst | 187 +
 docs/source/reference/misc.rst | 32 +
 docs/source/reference/ndarray.rst | 56 +
 docs/source/reference/ndimage.rst | 3 +
 docs/source/reference/pad.rst | 10 +
 docs/source/reference/polynomials.rst | 75 +
 docs/source/reference/random.rst | 127 +
 docs/source/reference/routines.rst | 31 +
 docs/source/reference/scipy.rst | 27 +
 docs/source/reference/scipy_fft.rst | 84 +
 docs/source/reference/scipy_fftpack.rst | 38 +
 docs/source/reference/scipy_interpolate.rst | 47 +
 docs/source/reference/scipy_linalg.rst | 50 +
 docs/source/reference/scipy_ndimage.rst | 139 +
 docs/source/reference/scipy_signal.rst | 33 +
 docs/source/reference/scipy_sparse.rst | 102 +
 .../source/reference/scipy_sparse_csgraph.rst | 22 +
 docs/source/reference/scipy_sparse_linalg.rst | 83 +
 docs/source/reference/scipy_spatial.rst | 22 +
 .../reference/scipy_spatial_distance.rst | 51 +
 docs/source/reference/scipy_special.rst | 164 +
 docs/source/reference/scipy_stats.rst | 28 +
 docs/source/reference/set.rst | 27 +
 docs/source/reference/signal.rst | 3 +
 docs/source/reference/sorting.rst | 47 +
 docs/source/reference/sparse.rst | 3 +
 docs/source/reference/special.rst | 4 +
 docs/source/reference/statistics.rst | 61 +
 docs/source/reference/stats.rst | 3 +
 docs/source/reference/testing.rst | 87 +
 docs/source/reference/ufunc.rst | 186 +
 docs/source/reference/window.rst | 18 +
 docs/source/spelling_wordlist.txt | 173 +
 docs/source/upgrade.rst | 709 +++
 docs/source/user_guide/basic.rst | 222 +
 docs/source/user_guide/compatibility.rst | 150 +
 docs/source/user_guide/cuda_api.rst | 89 +
 docs/source/user_guide/difference.rst | 194 +
 docs/source/user_guide/fft.rst | 315 +
 docs/source/user_guide/index.rst | 17 +
 docs/source/user_guide/interoperability.rst | 462 ++
 docs/source/user_guide/kernel.rst | 581 ++
 docs/source/user_guide/memory.rst | 200 +
 docs/source/user_guide/performance.rst | 128 +
 examples/cg/cg.py | 84 +
 examples/cusparselt/matmul.py | 99 +
 examples/custom_struct/README.md | 11 +
 examples/custom_struct/builtin_vectors.py | 48 +
 examples/custom_struct/complex_struct.py | 133 +
 examples/custom_struct/packed_matrix.py | 91 +
 examples/cutensor/contraction.py | 45 +
 examples/cutensor/elementwise_binary.py | 46 +
 examples/cutensor/elementwise_trinary.py | 56 +
 examples/cutensor/reduction.py | 43 +
 examples/finance/black_scholes.py | 144 +
 examples/finance/monte_carlo.py | 161 +
 examples/finance/monte_carlo_multigpu.py | 93 +
 examples/gemm/README.md | 37 +
 examples/gemm/sgemm.cu | 200 +
 examples/gemm/sgemm.py | 89 +
 examples/gemm/utils.py | 22 +
 examples/gmm/README.md | 19 +
 examples/gmm/gmm.py | 177 +
 .../mpi4py_multiple_devices.py | 53 +
 examples/jit/elmentwise_op.py | 21 +
 examples/jit/reduction_atomic.py | 33 +
 examples/jit/reduction_simple.py | 33 +
 examples/kmeans/README.md | 20 +
 examples/kmeans/kmeans.py | 152 +
 examples/peer/peer_matrix.py | 17 +
 examples/stream/cublas.py | 19 +
 examples/stream/cudnn.py | 21 +
 examples/stream/cufft.py | 18 +
 examples/stream/cupy_event.py | 32 +
 examples/stream/cupy_kernel.py | 18 +
 examples/stream/cupy_memcpy.py | 50 +
 examples/stream/curand.py | 12 +
 examples/stream/cusolver.py | 20 +
 examples/stream/cusparse.py | 30 +
 examples/stream/map_reduce.py | 47 +
 examples/stream/thrust.py | 18 +
 install/cupy_builder/__init__.py | 24 +
 install/cupy_builder/_command.py | 209 +
 install/cupy_builder/_compiler.py | 292 +
 install/cupy_builder/_context.py | 93 +
 install/cupy_builder/_environment.py | 37 +
 install/cupy_builder/_features.py | 490 ++
 install/cupy_builder/_preflight.py | 39 +
 install/cupy_builder/cupy_setup_build.py | 451 ++
 install/cupy_builder/install_build.py | 748 +++
 install/cupy_builder/install_utils.py | 22 +
 install/mypy.ini | 23 +
 install/universal_pkg/RELEASE.md | 5 +
 install/universal_pkg/__init__.py | 0
 install/universal_pkg/setup.py | 285 +
 setup.cfg | 61 +
 setup.py | 149 +
 tests/conftest.py | 74 +
 tests/cupy_tests/__init__.py | 0
 tests/cupy_tests/array_api_tests/__init__.py | 26 +
 .../array_api_tests/test_array_object.py | 366 ++
 .../test_creation_functions.py | 155 +
 .../test_data_type_functions.py | 19 +
 .../test_elementwise_functions.py | 111 +
 .../test_indexing_functions.py | 22 +
 .../array_api_tests/test_set_functions.py | 21 +
 .../array_api_tests/test_sorting_functions.py | 23 +
 .../array_api_tests/test_validation.py | 27 +
 tests/cupy_tests/binary_tests/__init__.py | 0
 .../binary_tests/test_elementwise.py | 37 +
 tests/cupy_tests/binary_tests/test_packing.py | 78 +
 tests/cupy_tests/core_tests/__init__.py | 0
 .../core_tests/fusion_tests/__init__.py | 0
 .../core_tests/fusion_tests/fusion_utils.py | 144 +
 .../core_tests/fusion_tests/test_array.py | 309 +
 .../core_tests/fusion_tests/test_example.py | 46 +
 .../core_tests/fusion_tests/test_indexing.py | 117 +
 .../fusion_tests/test_kernel_cache.py | 176 +
 .../core_tests/fusion_tests/test_misc.py | 422 ++
 .../fusion_tests/test_optimization.py | 199 +
 .../core_tests/fusion_tests/test_reduction.py | 242 +
 .../core_tests/fusion_tests/test_routines.py | 412 ++
 .../core_tests/fusion_tests/test_ufunc.py | 316 +
 .../core_tests/test_array_function.py | 58 +
 tests/cupy_tests/core_tests/test_carray.py | 85 +
 tests/cupy_tests/core_tests/test_core.py | 120 +
 .../core_tests/test_cub_reduction.py | 148 +
 tests/cupy_tests/core_tests/test_dlpack.py | 150 +
 .../cupy_tests/core_tests/test_elementwise.py | 145 +
 tests/cupy_tests/core_tests/test_flags.py | 67 +
 tests/cupy_tests/core_tests/test_function.py | 218 +
 tests/cupy_tests/core_tests/test_gufuncs.py | 282 +
 tests/cupy_tests/core_tests/test_include.py | 86 +
 tests/cupy_tests/core_tests/test_internal.py | 245 +
 tests/cupy_tests/core_tests/test_iter.py | 42 +
 tests/cupy_tests/core_tests/test_ndarray.py | 648 +++
 .../core_tests/test_ndarray_adv_indexing.py | 709 +++
 .../core_tests/test_ndarray_complex_ops.py | 166 +
 .../core_tests/test_ndarray_contiguity.py | 16 +
 .../core_tests/test_ndarray_conversion.py | 59 +
 .../core_tests/test_ndarray_copy_and_view.py | 460 ++
 .../test_ndarray_cuda_array_interface.py | 314 +
 .../core_tests/test_ndarray_elementwise_op.py | 775 +++
 .../cupy_tests/core_tests/test_ndarray_get.py | 142 +
 .../core_tests/test_ndarray_indexing.py | 232 +
 .../core_tests/test_ndarray_math.py | 114 +
 .../core_tests/test_ndarray_owndata.py | 20 +
 .../core_tests/test_ndarray_reduction.py | 498 ++
 .../core_tests/test_ndarray_scatter.py | 283 +
 .../core_tests/test_ndarray_ufunc.py | 270 +
 .../core_tests/test_ndarray_unary_op.py | 145 +
 tests/cupy_tests/core_tests/test_raw.py | 1367 +++++
 tests/cupy_tests/core_tests/test_reduction.py | 233 +
 tests/cupy_tests/core_tests/test_scan.py | 55 +
 .../cupy_tests/core_tests/test_syncdetect.py | 33 +
 .../core_tests/test_ufunc_methods.py | 227 +
 .../cupy_tests/core_tests/test_userkernel.py | 359 ++
 tests/cupy_tests/creation_tests/__init__.py | 0
 tests/cupy_tests/creation_tests/test_basic.py | 496 ++
 .../creation_tests/test_from_data.py | 779 +++
 .../cupy_tests/creation_tests/test_matrix.py | 220 +
 .../cupy_tests/creation_tests/test_ranges.py | 401 ++
 tests/cupy_tests/cuda_tests/__init__.py | 0
 .../cuda_tests/memory_hooks_tests/__init__.py | 0
 .../memory_hooks_tests/test_debug_print.py | 49 +
 .../memory_hooks_tests/test_line_profile.py | 50 +
 tests/cupy_tests/cuda_tests/test_compiler.py | 136 +
 tests/cupy_tests/cuda_tests/test_cublas.py | 13 +
 tests/cupy_tests/cuda_tests/test_cudnn.py | 17 +
 tests/cupy_tests/cuda_tests/test_cufft.py | 232 +
 tests/cupy_tests/cuda_tests/test_curand.py | 47 +
 tests/cupy_tests/cuda_tests/test_cusolver.py | 13 +
 tests/cupy_tests/cuda_tests/test_cusparse.py | 27 +
 tests/cupy_tests/cuda_tests/test_cutensor.py | 14 +
 tests/cupy_tests/cuda_tests/test_device.py | 238 +
 tests/cupy_tests/cuda_tests/test_driver.py | 64 +
 tests/cupy_tests/cuda_tests/test_graph.py | 345 ++
 tests/cupy_tests/cuda_tests/test_memory.py | 1341 +++++
 .../cupy_tests/cuda_tests/test_memory_hook.py | 85 +
 tests/cupy_tests/cuda_tests/test_nccl.py | 101 +
 tests/cupy_tests/cuda_tests/test_nvrtc.py | 13 +
 tests/cupy_tests/cuda_tests/test_nvtx.py | 21 +
 .../cuda_tests/test_pinned_memory.py | 129 +
 tests/cupy_tests/cuda_tests/test_profile.py | 29 +
 tests/cupy_tests/cuda_tests/test_runtime.py | 57 +
 tests/cupy_tests/cuda_tests/test_stream.py | 286 +
 tests/cupy_tests/cuda_tests/test_texture.py | 425 ++
 tests/cupy_tests/fft_tests/__init__.py | 0
 tests/cupy_tests/fft_tests/test_cache.py | 493 ++
 tests/cupy_tests/fft_tests/test_callback.py | 708 +++
 tests/cupy_tests/fft_tests/test_fft.py | 1362 +++++
 tests/cupy_tests/functional_tests/__init__.py | 0
 .../functional_tests/test_piecewise.py | 119 +
 .../functional_tests/test_vectorize.py | 668 +++
 tests/cupy_tests/indexing_tests/__init__.py | 0
 .../indexing_tests/test_generate.py | 410 ++
 .../indexing_tests/test_indexing.py | 383 ++
 .../cupy_tests/indexing_tests/test_insert.py | 305 +
 .../cupy_tests/indexing_tests/test_iterate.py | 138 +
 tests/cupy_tests/io_tests/__init__.py | 0
 tests/cupy_tests/io_tests/test_base_n.py | 48 +
 tests/cupy_tests/io_tests/test_formatting.py | 41 +
 tests/cupy_tests/io_tests/test_npz.py | 100 +
 tests/cupy_tests/io_tests/test_text.py | 25 +
 tests/cupy_tests/lib_tests/__init__.py | 0
 tests/cupy_tests/lib_tests/test_polynomial.py | 898 +++
 tests/cupy_tests/lib_tests/test_shape_base.py | 108 +
 .../lib_tests/test_strided_tricks.py | 46 +
 tests/cupy_tests/linalg_tests/__init__.py | 0
 .../linalg_tests/test_decomposition.py | 368 ++
 .../linalg_tests/test_eigenvalue.py | 181 +
 tests/cupy_tests/linalg_tests/test_einsum.py | 563 ++
 tests/cupy_tests/linalg_tests/test_norms.py | 208 +
 tests/cupy_tests/linalg_tests/test_product.py | 478 ++
 tests/cupy_tests/linalg_tests/test_solve.py | 334 ++
 tests/cupy_tests/logic_tests/__init__.py | 0
 .../cupy_tests/logic_tests/test_comparison.py | 291 +
 tests/cupy_tests/logic_tests/test_content.py | 46 +
 tests/cupy_tests/logic_tests/test_ops.py | 31 +
 tests/cupy_tests/logic_tests/test_truth.py | 305 +
 .../cupy_tests/logic_tests/test_type_test.py | 106 +
 .../cupy_tests/manipulation_tests/__init__.py | 0
 .../manipulation_tests/test_add_remove.py | 211 +
 .../manipulation_tests/test_basic.py | 259 +
 .../manipulation_tests/test_dims.py | 344 ++
 .../manipulation_tests/test_join.py | 452 ++
 .../manipulation_tests/test_kind.py | 127 +
 .../manipulation_tests/test_rearrange.py | 272 +
 .../manipulation_tests/test_shape.py | 243 +
 .../manipulation_tests/test_split.py | 96 +
 .../manipulation_tests/test_tiling.py | 131 +
 .../manipulation_tests/test_transpose.py | 170 +
 tests/cupy_tests/math_tests/__init__.py | 0
 .../cupy_tests/math_tests/test_arithmetic.py | 533 ++
 tests/cupy_tests/math_tests/test_explog.py | 70 +
 tests/cupy_tests/math_tests/test_floating.py | 67 +
 .../cupy_tests/math_tests/test_hyperbolic.py | 39 +
 tests/cupy_tests/math_tests/test_matmul.py | 400 ++
 tests/cupy_tests/math_tests/test_misc.py | 537 ++
 tests/cupy_tests/math_tests/test_rational.py | 37 +
 tests/cupy_tests/math_tests/test_rounding.py | 165 +
 tests/cupy_tests/math_tests/test_special.py | 24 +
 tests/cupy_tests/math_tests/test_sumprod.py | 1150 ++++
 .../math_tests/test_trigonometric.py | 113 +
 tests/cupy_tests/math_tests/test_window.py | 38 +
 tests/cupy_tests/misc_tests/__init__.py | 0
 .../cupy_tests/misc_tests/test_byte_bounds.py | 76 +
 .../misc_tests/test_memory_ranges.py | 152 +
 tests/cupy_tests/misc_tests/test_who.py | 65 +
 tests/cupy_tests/padding_tests/__init__.py | 0
 tests/cupy_tests/padding_tests/test_pad.py | 310 +
 .../polynomial_tests/test_polynomial.py | 134 +
 .../polynomial_tests/test_polyutils.py | 172 +
 tests/cupy_tests/prof_tests/__init__.py | 0
 tests/cupy_tests/prof_tests/test_range.py | 98 +
 tests/cupy_tests/random_tests/__init__.py | 0
 .../random_tests/common_distributions.py | 521 ++
 .../random_tests/test_bit_generator.py | 75 +
 .../random_tests/test_distributions.py | 852 +++
 .../cupy_tests/random_tests/test_generator.py | 1290 ++++
 .../random_tests/test_generator_api.py | 370 ++
 tests/cupy_tests/random_tests/test_init.py | 6 +
 .../random_tests/test_permutations.py | 182 +
 tests/cupy_tests/random_tests/test_random.py | 19 +
 tests/cupy_tests/random_tests/test_sample.py | 300 +
 tests/cupy_tests/sorting_tests/__init__.py | 0
 tests/cupy_tests/sorting_tests/test_count.py | 60 +
 tests/cupy_tests/sorting_tests/test_search.py | 832 +++
 tests/cupy_tests/sorting_tests/test_sort.py | 783 +++
 tests/cupy_tests/statistics_tests/__init__.py | 0
 .../statistics_tests/test_correlation.py | 184 +
 .../statistics_tests/test_histogram.py | 566 ++
 .../statistics_tests/test_meanvar.py | 514 ++
 .../cupy_tests/statistics_tests/test_order.py | 430 ++
 tests/cupy_tests/test_cublas.py | 629 ++
 tests/cupy_tests/test_init.py | 153 +
 tests/cupy_tests/test_ndim.py | 66 +
 tests/cupy_tests/test_numpy_interop.py | 150 +
 tests/cupy_tests/test_type_routines.py | 91 +
 tests/cupy_tests/test_typing.py | 8 +
 tests/cupy_tests/testing_tests/__init__.py | 0
 tests/cupy_tests/testing_tests/test_array.py | 116 +
 .../testing_tests/test_condition.py | 190 +
 tests/cupy_tests/testing_tests/test_helper.py | 185 +
 tests/cupy_tests/testing_tests/test_loops.py | 510 ++
 .../testing_tests/test_parameterized.py | 232 +
 tests/cupyx_tests/__init__.py | 0
 .../distributed_tests/comm_runner.py | 607 ++
 .../distributed_tests/test_array.py | 106 +
 .../distributed_tests/test_comm.py | 189 +
 .../distributed_tests/test_store.py | 43 +
 .../fallback_mode_tests/__init__.py | 0
 .../fallback_mode_tests/test_fallback.py | 657 +++
 .../fallback_mode_tests/test_notifications.py | 172 +
 tests/cupyx_tests/jit_tests/__init__.py | 0
 .../jit_tests/test_cooperative_groups.py | 185 +
 tests/cupyx_tests/jit_tests/test_cub.py | 128 +
 .../jit_tests/test_device_function.py | 103 +
 tests/cupyx_tests/jit_tests/test_raw.py | 770 +++
 tests/cupyx_tests/jit_tests/test_thrust.py | 1125 ++++
 tests/cupyx_tests/linalg_tests/__init__.py | 0
 .../linalg_tests/sparse_tests/__init__.py | 8 +
 .../linalg_tests/sparse_tests/test_solve.py | 67 +
 tests/cupyx_tests/linalg_tests/test_solve.py | 84 +
 tests/cupyx_tests/profiler_tests/__init__.py | 0
 .../profiler_tests/test_benchmark.py | 132 +
 .../profiler_tests/test_profile.py | 29 +
 .../profiler_tests/test_time_range.py | 98 +
 tests/cupyx_tests/scipy_tests/__init__.py | 0
 .../scipy_tests/fft_tests/__init__.py | 0
 .../scipy_tests/fft_tests/test_fft.py | 1786 ++++++
 .../scipy_tests/fft_tests/test_fftlog.py | 75 +
 .../scipy_tests/fft_tests/test_helper.py | 21 +
 .../fft_tests/test_realtransforms.py | 173 +
 .../scipy_tests/fftpack_tests/__init__.py | 0
 .../scipy_tests/fftpack_tests/test_fftpack.py | 689 +++
 .../scipy_tests/interpolate_tests/__init__.py | 0
 .../interpolate_tests/test_bspline.py | 489 ++
 .../interpolate_tests/test_bspline2.py | 458 ++
 .../interpolate_tests/test_polyint.py | 617 ++
 .../interpolate_tests/test_ppoly.py | 1315 +++++
 .../interpolate_tests/test_rbfinterp.py | 576 ++
 .../scipy_tests/interpolate_tests/test_rgi.py | 741 +++
 .../scipy_tests/linalg_tests/__init__.py | 0
 .../linalg_tests/test_decomp_lu.py | 155 +
 .../linalg_tests/test_solve_triangular.py | 95 +
 .../linalg_tests/test_special_matrices.py | 107 +
 .../scipy_tests/linalg_tests/test_uarray.py | 9 +
 .../scipy_tests/ndimage_tests/__init__.py | 0
 .../scipy_tests/ndimage_tests/test_filters.py | 824 +++
 .../scipy_tests/ndimage_tests/test_fourier.py | 421 ++
 .../ndimage_tests/test_interpolation.py | 1029 ++++
 .../ndimage_tests/test_measurements.py | 452 +
 .../ndimage_tests/test_morphology.py | 610 ++
 .../scipy_tests/signal_tests/test_bsplines.py | 33 +
 .../signal_tests/test_signaltools.py | 362 ++
 .../scipy_tests/sparse_tests/__init__.py | 0
 .../csgraph_tests/test_traversal.py | 63 +
 .../scipy_tests/sparse_tests/test_base.py | 84 +
 .../sparse_tests/test_construct.py | 476 ++
 .../scipy_tests/sparse_tests/test_coo.py | 1191 ++++
 .../scipy_tests/sparse_tests/test_csc.py | 1560 +++++
 .../scipy_tests/sparse_tests/test_csr.py | 2170 +++++++
 .../scipy_tests/sparse_tests/test_dia.py | 370 ++
 .../scipy_tests/sparse_tests/test_extract.py | 75 +
 .../scipy_tests/sparse_tests/test_index.py | 579 ++
 .../scipy_tests/sparse_tests/test_linalg.py | 1697 ++++++
 .../scipy_tests/spatial_tests/__init__.py | 0
 .../spatial_tests/test_distance.py | 232 +
 .../scipy_tests/special_tests/__init__.py | 0
 .../scipy_tests/special_tests/test_basic.py | 218 +
 .../scipy_tests/special_tests/test_bessel.py | 135 +
 .../scipy_tests/special_tests/test_beta.py | 144 +
 .../special_tests/test_convex_analysis.py | 59 +
 .../scipy_tests/special_tests/test_digamma.py | 59 +
 .../scipy_tests/special_tests/test_erf.py | 148 +
 .../scipy_tests/special_tests/test_exp1.py | 40 +
 .../scipy_tests/special_tests/test_expi.py | 40 +
 .../scipy_tests/special_tests/test_expn.py | 52 +
 .../scipy_tests/special_tests/test_gamma.py | 63 +
 .../special_tests/test_gammainc.py | 169 +
 .../scipy_tests/special_tests/test_gammaln.py | 119 +
 .../special_tests/test_log_softmax.py | 66 +
 .../special_tests/test_logsumexp.py | 112 +
 .../special_tests/test_polygamma.py | 58 +
 .../scipy_tests/special_tests/test_softmax.py | 82 +
 .../special_tests/test_sph_harm.py | 28 +
 .../special_tests/test_statistics.py | 302 +
 .../special_tests/test_ufunc_dispatch.py | 54 +
 .../scipy_tests/special_tests/test_zeta.py | 48 +
 .../stats_tests/test_distributions.py | 138 +
 .../scipy_tests/stats_tests/test_morestats.py | 145 +
 .../scipy_tests/stats_tests/test_stats.py | 244 +
 .../scipy_tests/test_get_array_module.py | 28 +
 tests/cupyx_tests/test_cudnn.py | 457 ++
 tests/cupyx_tests/test_cupyx.py | 30 +
 tests/cupyx_tests/test_cusolver.py | 316 +
 tests/cupyx_tests/test_cusparse.py | 1118 ++++
 tests/cupyx_tests/test_cutensor.py | 370 ++
 tests/cupyx_tests/test_lapack.py | 167 +
 tests/cupyx_tests/test_optimize.py | 212 +
 tests/cupyx_tests/test_pinned_array.py | 422 ++
 tests/cupyx_tests/test_rsqrt.py | 18 +
 tests/cupyx_tests/test_runtime.py | 46 +
 tests/cupyx_tests/test_time.py | 131 +
 tests/cupyx_tests/tools_tests/__init__.py | 0
 .../tools_tests/test_install_library.py | 78 +
 tests/example_tests/__init__.py | 0
 tests/example_tests/example_test.py | 18 +
 tests/example_tests/test_custom_struct.py | 34 +
 tests/example_tests/test_finance.py | 62 +
 tests/example_tests/test_gemm.py | 9 +
 tests/example_tests/test_gmm.py | 37 +
 tests/example_tests/test_kmeans.py | 43 +
 tests/install_tests/__init__.py | 20 +
 tests/install_tests/test_build.py | 39 +
 .../test_cupy_builder/__init__.py | 0
 .../test_cupy_builder/test_command.py | 17 +
 .../test_cupy_builder/test_context.py | 86 +
 .../test_cupy_builder/test_features.py | 30 +
 .../test_universal_pkg/__init__.py | 0
 .../test_universal_pkg/test_setup.py | 54 +
 tests/install_tests/test_utils.py | 18 +
 third_party/cub/.cproject | 1223 ++++
 third_party/cub/.project | 27 +
 third_party/cub/.settings/.gitignore | 1 +
 .../org.eclipse.cdt.codan.core.prefs | 72 +
 .../cub/.settings/org.eclipse.cdt.core.prefs | 177 +
 .../cub/.settings/org.eclipse.cdt.ui.prefs | 3 +
 .../.settings/org.eclipse.core.runtime.prefs | 4 +
 third_party/cub/CHANGE_LOG.TXT | 403 ++
 third_party/cub/LICENSE.TXT | 24 +
 third_party/cub/README.md | 128 +
 third_party/cub/common.mk | 233 +
 third_party/cub/cub/agent/agent_histogram.cuh | 787 +++
 .../cub/agent/agent_radix_sort_downsweep.cuh | 789 +++
 .../cub/agent/agent_radix_sort_upsweep.cuh | 526 ++
 third_party/cub/cub/agent/agent_reduce.cuh | 385 ++
 .../cub/cub/agent/agent_reduce_by_key.cuh | 547 ++
 third_party/cub/cub/agent/agent_rle.cuh | 837 +++
 third_party/cub/cub/agent/agent_scan.cuh | 471 ++
 .../cub/cub/agent/agent_segment_fixup.cuh | 375 ++
 third_party/cub/cub/agent/agent_select_if.cuh | 703 +++
 third_party/cub/cub/agent/agent_spmv_orig.cuh | 670 +++
 .../cub/agent/single_pass_scan_operators.cuh | 815 +++
 .../cub/block/block_adjacent_difference.cuh | 596 ++
 .../cub/cub/block/block_discontinuity.cuh | 1148 ++++
 third_party/cub/cub/block/block_exchange.cuh | 1248 ++++
 third_party/cub/cub/block/block_histogram.cuh | 415 ++
 third_party/cub/cub/block/block_load.cuh | 1241 ++++
 .../cub/cub/block/block_radix_rank.cuh | 696 +++
 .../cub/cub/block/block_radix_sort.cuh | 863 +++
 .../cub/cub/block/block_raking_layout.cuh | 152 +
 third_party/cub/cub/block/block_reduce.cuh | 607 ++
 third_party/cub/cub/block/block_scan.cuh | 2126 +++++++
 third_party/cub/cub/block/block_shuffle.cuh | 305 +
 third_party/cub/cub/block/block_store.cuh | 1000 ++++
 .../block_histogram_atomic.cuh | 82 +
 .../specializations/block_histogram_sort.cuh | 226 +
 .../specializations/block_reduce_raking.cuh | 226 +
 .../block_reduce_raking_commutative_only.cuh | 199 +
 .../block_reduce_warp_reductions.cuh | 218 +
 .../specializations/block_scan_raking.cuh | 666 +++
 .../specializations/block_scan_warp_scans.cuh | 392 ++
 .../block_scan_warp_scans2.cuh | 436 ++
 .../block_scan_warp_scans3.cuh | 418 ++
 third_party/cub/cub/cub.cuh | 95 +
 .../cub/cub/device/device_histogram.cuh | 866 +++
 .../cub/cub/device/device_partition.cuh | 273 +
 .../cub/cub/device/device_radix_sort.cuh | 797 +++
 third_party/cub/cub/device/device_reduce.cuh | 734 +++
 .../cub/device/device_run_length_encode.cuh | 278 +
 third_party/cub/cub/device/device_scan.cuh | 443 ++
 .../device/device_segmented_radix_sort.cuh | 876 +++
 .../cub/device/device_segmented_reduce.cuh | 619 ++
 third_party/cub/cub/device/device_select.cuh | 369 ++
 third_party/cub/cub/device/device_spmv.cuh | 174 +
 .../device/dispatch/dispatch_histogram.cuh | 1096 ++++
 .../device/dispatch/dispatch_radix_sort.cuh | 1619 ++++++
 .../cub/device/dispatch/dispatch_reduce.cuh | 882 +++
 .../dispatch/dispatch_reduce_by_key.cuh | 554 ++
 .../cub/cub/device/dispatch/dispatch_rle.cuh | 538 ++
 .../cub/cub/device/dispatch/dispatch_scan.cuh | 563 ++
 .../device/dispatch/dispatch_select_if.cuh | 542 ++
 .../device/dispatch/dispatch_spmv_orig.cuh | 834 +++
 third_party/cub/cub/grid/grid_barrier.cuh | 211 +
 third_party/cub/cub/grid/grid_even_share.cuh | 222 +
 third_party/cub/cub/grid/grid_mapping.cuh | 113 +
 third_party/cub/cub/grid/grid_queue.cuh | 220 +
 third_party/cub/cub/host/mutex.cuh | 171 +
 .../cub/iterator/arg_index_input_iterator.cuh | 259 +
 .../cache_modified_input_iterator.cuh | 240 +
 .../cache_modified_output_iterator.cuh | 254 +
 .../cub/iterator/constant_input_iterator.cuh | 235 +
 .../cub/iterator/counting_input_iterator.cuh | 228 +
 .../cub/iterator/discard_output_iterator.cuh | 220 +
 .../cub/iterator/tex_obj_input_iterator.cuh | 310 +
 .../cub/iterator/tex_ref_input_iterator.cuh | 374 ++
 .../cub/iterator/transform_input_iterator.cuh | 252 +
 third_party/cub/cub/thread/thread_load.cuh | 438 ++
 .../cub/cub/thread/thread_operators.cuh | 317 +
 third_party/cub/cub/thread/thread_reduce.cuh | 152 +
 third_party/cub/cub/thread/thread_scan.cuh | 268 +
 third_party/cub/cub/thread/thread_search.cuh | 154 +
 third_party/cub/cub/thread/thread_store.cuh | 422 +
 third_party/cub/cub/util_allocator.cuh | 708 +++
 third_party/cub/cub/util_arch.cuh | 151 +
 third_party/cub/cub/util_debug.cuh | 145 +
 third_party/cub/cub/util_device.cuh | 347 ++
 third_party/cub/cub/util_macro.cuh | 103 +
 third_party/cub/cub/util_namespace.cuh | 46 +
 third_party/cub/cub/util_ptx.cuh | 758 +++
 third_party/cub/cub/util_type.cuh | 1167 ++++
 .../warp/specializations/warp_reduce_shfl.cuh | 541 ++
 .../warp/specializations/warp_reduce_smem.cuh | 372 ++
 .../warp/specializations/warp_scan_shfl.cuh | 632 ++
 .../warp/specializations/warp_scan_smem.cuh | 397 ++
 third_party/cub/cub/warp/warp_reduce.cuh | 612 ++
 third_party/cub/cub/warp/warp_scan.cuh | 936 +++
 .../cub/eclipse code style profile.xml | 155 +
 third_party/cub/examples/block/.gitignore | 7 +
 third_party/cub/examples/block/Makefile | 128 +
 .../block/example_block_radix_sort.cu | 323 +
 .../examples/block/example_block_reduce.cu | 290 +
 .../cub/examples/block/example_block_scan.cu | 334 ++
 .../cub/examples/block/reduce_by_key.cu | 57 +
 third_party/cub/examples/device/.gitignore | 8 +
 third_party/cub/examples/device/Makefile | 197 +
 .../example_device_partition_flagged.cu | 233 +
 .../device/example_device_partition_if.cu | 244 +
 .../device/example_device_radix_sort.cu | 226 +
 .../examples/device/example_device_reduce.cu | 180 +
 .../examples/device/example_device_scan.cu | 186 +
 .../device/example_device_select_flagged.cu | 233 +
 .../device/example_device_select_if.cu | 242 +
 .../device/example_device_select_unique.cu | 221 +
 ...ample_device_sort_find_non_trivial_runs.cu | 384 ++
 third_party/cub/experimental/.gitignore | 1 +
 third_party/cub/experimental/Makefile | 125 +
 .../experimental/defunct/example_coo_spmv.cu | 1070 ++++
 .../defunct/test_device_seg_reduce.cu | 2142 +++++++
 .../experimental/histogram/histogram_cub.h | 109 +
 .../histogram/histogram_gmem_atomics.h | 185 +
 .../histogram/histogram_smem_atomics.h | 195 +
 .../cub/experimental/histogram_compare.cu | 635 ++
 third_party/cub/experimental/sparse_matrix.h | 1244 ++++
 third_party/cub/experimental/spmv_compare.cu | 917 +++
 third_party/cub/experimental/spmv_script.sh | 30 +
 third_party/cub/test/.gitignore | 3 +
 third_party/cub/test/Makefile | 468 ++
 third_party/cub/test/half.h | 298 +
 third_party/cub/test/link_a.cu | 11 +
 third_party/cub/test/link_b.cu | 11 +
 third_party/cub/test/link_main.cpp | 10 +
 third_party/cub/test/mersenne.h | 160 +
 third_party/cub/test/test_allocator.cu | 459 ++
 third_party/cub/test/test_block_histogram.cu | 310 +
 third_party/cub/test/test_block_load_store.cu | 549 ++
 third_party/cub/test/test_block_radix_sort.cu | 717 +++
 third_party/cub/test/test_block_reduce.cu | 822 +++
 third_party/cub/test/test_block_scan.cu | 929 +++
 third_party/cub/test/test_device_histogram.cu | 1669 ++++++
 .../cub/test/test_device_radix_sort.cu | 1298 +++++
 third_party/cub/test/test_device_reduce.cu | 1359 +++++
 .../cub/test/test_device_reduce_by_key.cu | 853 +++
 .../cub/test/test_device_run_length_encode.cu | 890 +++
 third_party/cub/test/test_device_scan.cu | 1015 ++++
 third_party/cub/test/test_device_select_if.cu | 1039 ++++
 .../cub/test/test_device_select_unique.cu | 651 +++
 third_party/cub/test/test_grid_barrier.cu | 152 +
 third_party/cub/test/test_iterator.cu | 805 +++
 third_party/cub/test/test_util.h | 1628 ++++++
 third_party/cub/test/test_warp_reduce.cu | 840 +++
 third_party/cub/test/test_warp_scan.cu | 661 +++
 third_party/cub/tune/.gitignore | 1 +
 third_party/cub/tune/Makefile | 192 +
 third_party/cub/tune/tune_device_reduce.cu | 763 +++
 1156 files changed, 357999 insertions(+)
 create mode 100644 CITATION.bib
 create mode 100644 CODE_OF_CONDUCT.md
 create mode 100644 LICENSE
 create mode 100644 MANIFEST.in
 create mode 100644 codecov.yml
 create mode 100644 cupy/__init__.py
 create mode 100644 cupy/_binary/__init__.py
 create mode 100644 cupy/_binary/elementwise.py
 create mode 100644 cupy/_binary/packing.py
 create mode 100644 cupy/_core/__init__.pxd
 create mode 100644 cupy/_core/__init__.py
 create mode 100644 cupy/_core/_accelerator.pxd
 create mode 100644 cupy/_core/_accelerator.pyx
 create mode 100644 cupy/_core/_carray.pxd
 create mode 100644 cupy/_core/_carray.pyx
 create mode 100644 cupy/_core/_codeblock.py
 create mode 100644 cupy/_core/_cub_reduction.pxd
 create mode 100644 cupy/_core/_cub_reduction.pyx
 create mode 100644 cupy/_core/_dtype.pxd
 create mode 100644 cupy/_core/_dtype.pyx
 create mode 100644 cupy/_core/_fusion_interface.py
 create mode 100644 cupy/_core/_fusion_kernel.pyx
 create mode 100644 cupy/_core/_fusion_op.py
 create mode 100644 cupy/_core/_fusion_optimization.py
 create mode 100644 cupy/_core/_fusion_thread_local.pyx
 create mode 100644 cupy/_core/_fusion_trace.pyx
 create mode 100644 cupy/_core/_fusion_variable.pxd
 create mode 100644 cupy/_core/_fusion_variable.pyx
 create mode 100644 cupy/_core/_gufuncs.py
 create mode 100644 cupy/_core/_kernel.pxd
 create mode 100644 cupy/_core/_kernel.pyx
 create mode 100644 cupy/_core/_memory_range.pxd
 create mode 100644 cupy/_core/_memory_range.pyx
 create mode 100644 cupy/_core/_optimize_config.pxd
 create mode 100644 cupy/_core/_optimize_config.pyx
 create mode 100644 cupy/_core/_reduction.pxd
 create mode 100644 cupy/_core/_reduction.pyx
 create mode 100644 cupy/_core/_routines_binary.pxd
 create mode 100644 cupy/_core/_routines_binary.pyx
 create mode 100644 cupy/_core/_routines_indexing.pxd
 create mode 100644 cupy/_core/_routines_indexing.pyx
 create mode 100644 cupy/_core/_routines_linalg.pxd
 create mode 100644 cupy/_core/_routines_linalg.pyx
 create mode 100644 cupy/_core/_routines_logic.pxd
 create mode 100644 cupy/_core/_routines_logic.pyx
 create mode 100644 cupy/_core/_routines_manipulation.pxd
 create mode 100644 cupy/_core/_routines_manipulation.pyx
 create mode 100644 cupy/_core/_routines_math.pxd
 create mode 100644 cupy/_core/_routines_math.pyx
 create mode 100644 cupy/_core/_routines_sorting.pxd
 create mode 100644 cupy/_core/_routines_sorting.pyx
 create mode 100644 cupy/_core/_routines_statistics.pxd
 create mode 100644 cupy/_core/_routines_statistics.pyx
 create mode 100644 cupy/_core/_scalar.pxd
 create mode 100644 cupy/_core/_scalar.pyx
 create mode 100644 cupy/_core/_ufuncs.py
 create mode 100644 cupy/_core/core.pxd
 create mode 100644 cupy/_core/core.pyx
 create mode 100644 cupy/_core/dlpack.pxd
 create mode 100644 cupy/_core/dlpack.pyx
 create mode 100644 cupy/_core/flags.pyx
 create mode 100644 cupy/_core/fusion.pyx
 create mode 100644 cupy/_core/halffloat.h
 create mode 100644 cupy/_core/include/cupy/README.md
 create mode 100644 cupy/_core/include/cupy/_cuda/README.md
 create mode 100755 cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.h
 create mode 100755 cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.hpp
 create mode 100755 cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.h
 create mode 100755 cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.hpp
 create mode 100755 cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.h
 create mode 100755 cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.hpp
 create mode 100644 cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.h
 create mode 100644 cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.hpp
 create mode 100644 cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.h
 create mode 100644 cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.hpp
 create mode 100644 cupy/_core/include/cupy/atomics.cuh
 create mode 100644 cupy/_core/include/cupy/carray.cuh
 create mode 100644 cupy/_core/include/cupy/complex.cuh
 create mode 100644 cupy/_core/include/cupy/complex/README.md
 create mode 100644 cupy/_core/include/cupy/complex/arithmetic.h
 create mode 100644 cupy/_core/include/cupy/complex/catrig.h
 create mode 100644 cupy/_core/include/cupy/complex/catrigf.h
 create mode 100644 cupy/_core/include/cupy/complex/ccosh.h
 create mode 100644 cupy/_core/include/cupy/complex/ccoshf.h
 create mode 100644 cupy/_core/include/cupy/complex/cexp.h
 create mode 100644 cupy/_core/include/cupy/complex/cexpf.h
 create mode 100644 cupy/_core/include/cupy/complex/clog.h
 create mode 100644 cupy/_core/include/cupy/complex/clogf.h
 create mode 100644 cupy/_core/include/cupy/complex/complex.h
 create mode 100644 cupy/_core/include/cupy/complex/complex_inl.h
 create mode 100644 cupy/_core/include/cupy/complex/cpow.h
 create mode 100644 cupy/_core/include/cupy/complex/cproj.h
 create mode 100644 cupy/_core/include/cupy/complex/csinh.h
 create mode 100644 cupy/_core/include/cupy/complex/csinhf.h
 create mode 100644 cupy/_core/include/cupy/complex/csqrt.h
 create mode 100644 cupy/_core/include/cupy/complex/csqrtf.h
 create mode 100644 cupy/_core/include/cupy/complex/ctanh.h
 create mode 100644 cupy/_core/include/cupy/complex/ctanhf.h
 create mode 100644 cupy/_core/include/cupy/complex/math_private.h
 create mode 100644 cupy/_core/include/cupy/cuComplex_bridge.h
 create mode 100644 cupy/_core/include/cupy/cub/.gitattributes
 create mode 120000 cupy/_core/include/cupy/cub/LICENSE.TXT
 create mode 120000 cupy/_core/include/cupy/cub/cub
 create mode 100644 cupy/_core/include/cupy/cuda_workaround.h
 create mode 100644 cupy/_core/include/cupy/dlpack/README.md
 create mode 100644 cupy/_core/include/cupy/dlpack/dlpack.h
 create mode 100644 cupy/_core/include/cupy/hip_workaround.cuh
 create mode 100644 cupy/_core/include/cupy/jitify/.clang-format
 create mode 100644 cupy/_core/include/cupy/jitify/.gitignore
 create mode 100644 cupy/_core/include/cupy/jitify/Doxyfile
 create mode 100644 cupy/_core/include/cupy/jitify/LICENSE
 create mode 100644 cupy/_core/include/cupy/jitify/Makefile
 create mode 100644 cupy/_core/include/cupy/jitify/README.md
 create mode 100644 cupy/_core/include/cupy/jitify/example_headers/class_arg_kernel.cuh
 create mode 100644 cupy/_core/include/cupy/jitify/example_headers/constant_header.cuh
 create mode 100644 cupy/_core/include/cupy/jitify/example_headers/my_header1.cuh
 create mode 100644 cupy/_core/include/cupy/jitify/example_headers/my_header2.cuh
 create mode 100644 cupy/_core/include/cupy/jitify/example_headers/my_header3.cuh
 create mode 100644 cupy/_core/include/cupy/jitify/jitify.hpp
 create mode 100644 cupy/_core/include/cupy/jitify/jitify_example.cpp
 create mode 100644 cupy/_core/include/cupy/jitify/jitify_test.cu
 create mode 100644 cupy/_core/include/cupy/jitify/stringify.cpp
 create mode 100644 cupy/_core/include/cupy/math_constants.h
 create mode 100644 cupy/_core/include/cupy/pair.cuh
 create mode 100644 cupy/_core/include/cupy/swap.cuh
 create mode 100644 cupy/_core/include/cupy/tuple.cuh
 create mode 100644 cupy/_core/include/cupy/tuple/pair.h
 create mode 100644 cupy/_core/include/cupy/tuple/tuple.h
 create mode 100644 cupy/_core/include/cupy/tuple/type_traits.h
 create mode 100644 cupy/_core/include/cupy/type_dispatcher.cuh
 create mode 100644 cupy/_core/internal.pxd
 create mode 100644 cupy/_core/internal.pyx
 create mode 100644 cupy/_core/new_fusion.pyx
 create mode 100644 cupy/_core/raw.pxd
 create mode 100644 cupy/_core/raw.pyx
 create mode 100644 cupy/_core/syncdetect.py
 create mode 100644 cupy/_creation/__init__.py
 create mode 100644 cupy/_creation/basic.py
 create mode 100644 cupy/_creation/from_data.py
 create mode 100644 cupy/_creation/matrix.py
 create mode 100644 cupy/_creation/ranges.py
 create mode 100644 cupy/_environment.py
 create mode 100644 cupy/_functional/__init__.py
 create mode 100644 cupy/_functional/piecewise.py
 create mode 100644 cupy/_functional/vectorize.py
 create mode 100644 cupy/_indexing/__init__.py
 create mode 100644 cupy/_indexing/generate.py
 create mode 100644 cupy/_indexing/indexing.py
 create mode 100644 cupy/_indexing/insert.py
 create mode 100644 cupy/_indexing/iterate.py
 create mode 100644 cupy/_io/__init__.py
 create mode 100644 cupy/_io/formatting.py
 create mode 100644 cupy/_io/npz.py
 create mode 100644 cupy/_io/text.py
 create mode 100644 cupy/_logic/__init__.py
 create mode 100644 cupy/_logic/comparison.py
 create mode 100644 cupy/_logic/content.py
 create mode 100644 cupy/_logic/ops.py
 create mode 100644 cupy/_logic/truth.py
 create mode 100644 cupy/_logic/type_testing.py
 create mode 100644 cupy/_manipulation/__init__.py
 create mode 100644 cupy/_manipulation/add_remove.py
 create mode 100644 cupy/_manipulation/basic.py
 create mode 100644 cupy/_manipulation/dims.py
 create mode 100644 cupy/_manipulation/join.py
 create mode 100644 cupy/_manipulation/kind.py
 create mode 100644 cupy/_manipulation/rearrange.py
 create mode 100644 cupy/_manipulation/shape.py
 create mode 100644 cupy/_manipulation/split.py
 create mode 100644 cupy/_manipulation/tiling.py
 create mode 100644 cupy/_manipulation/transpose.py
 create mode 100644 cupy/_math/__init__.py
 create mode 100644 cupy/_math/arithmetic.py
 create mode 100644 cupy/_math/explog.py
 create mode 100644 cupy/_math/floating.py
 create mode 100644 cupy/_math/hyperbolic.py
 create mode 100644 cupy/_math/misc.py
 create mode 100644 cupy/_math/rational.py
 create mode 100644 cupy/_math/rounding.py
 create mode 100644 cupy/_math/special.py
 create mode 100644 cupy/_math/sumprod.py
 create mode 100644 cupy/_math/trigonometric.py
 create mode 100644 cupy/_math/ufunc.py
 create mode 100644 cupy/_math/window.py
 create mode 100644 cupy/_misc/__init__.py
 create mode 100644 cupy/_misc/byte_bounds.py
 create mode 100644 cupy/_misc/memory_ranges.py
 create mode 100644 cupy/_misc/who.py
 create mode 100644 cupy/_padding/__init__.py
 create mode 100644 cupy/_padding/pad.py
 create mode 100644 cupy/_sorting/__init__.py
 create mode 100644 cupy/_sorting/count.py
 create mode 100644 cupy/_sorting/search.py
 create mode 100644 cupy/_sorting/sort.py
 create mode 100644 cupy/_statistics/__init__.py
 create mode 100644 cupy/_statistics/correlation.py
 create mode 100644 cupy/_statistics/histogram.py
 create mode 100644 cupy/_statistics/meanvar.py
 create mode 100644 cupy/_statistics/order.py
 create mode 100644 cupy/_util.pyi
 create mode 100644 cupy/_util.pyx
 create mode 100644 cupy/_version.py
 create mode 100644 cupy/array_api/__init__.py
 create mode 100644 cupy/array_api/_array_object.py
 create mode 100644 cupy/array_api/_constants.py
 create mode 100644 cupy/array_api/_creation_functions.py
 create mode 100644 cupy/array_api/_data_type_functions.py
 create mode 100644 cupy/array_api/_dtypes.py
 create mode 100644 cupy/array_api/_elementwise_functions.py
 create mode 100644 cupy/array_api/_indexing_functions.py
 create mode 100644 cupy/array_api/_manipulation_functions.py
 create mode 100644 cupy/array_api/_searching_functions.py
 create mode 100644 cupy/array_api/_set_functions.py
 create mode 100644 cupy/array_api/_sorting_functions.py
cupy/array_api/_sorting_functions.py create mode 100644 cupy/array_api/_statistical_functions.py create mode 100644 cupy/array_api/_typing.py create mode 100644 cupy/array_api/_utility_functions.py create mode 100644 cupy/array_api/linalg.py create mode 100644 cupy/cublas.py create mode 100644 cupy/cuda/__init__.pxd create mode 100644 cupy/cuda/__init__.py create mode 100644 cupy/cuda/common.pxd create mode 100644 cupy/cuda/common.pyx create mode 100644 cupy/cuda/compiler.py create mode 100644 cupy/cuda/cub.pxd create mode 100644 cupy/cuda/cub.pyx create mode 100644 cupy/cuda/cudnn.py create mode 100644 cupy/cuda/cufft.pxd create mode 100644 cupy/cuda/cufft.pyx create mode 100644 cupy/cuda/cupy_cub.cu create mode 100644 cupy/cuda/cupy_cub.h create mode 100644 cupy/cuda/cupy_cufft.h create mode 100644 cupy/cuda/cupy_cufftXt.cu create mode 100644 cupy/cuda/cupy_cufftXt.h create mode 100644 cupy/cuda/cupy_jitify.h create mode 100644 cupy/cuda/cupy_thrust.cu create mode 100644 cupy/cuda/cupy_thrust.h create mode 100644 cupy/cuda/cutensor.py create mode 100644 cupy/cuda/device.pxd create mode 100644 cupy/cuda/device.pyx create mode 100644 cupy/cuda/function.pxd create mode 100644 cupy/cuda/function.pyx create mode 100644 cupy/cuda/graph.pxd create mode 100644 cupy/cuda/graph.pyx create mode 100644 cupy/cuda/jitify.pyx create mode 100644 cupy/cuda/memory.pxd create mode 100644 cupy/cuda/memory.pyx create mode 100644 cupy/cuda/memory_hook.pxd create mode 100644 cupy/cuda/memory_hook.pyx create mode 100644 cupy/cuda/memory_hooks/__init__.py create mode 100644 cupy/cuda/memory_hooks/debug_print.py create mode 100644 cupy/cuda/memory_hooks/line_profile.py create mode 100644 cupy/cuda/nccl.py create mode 100644 cupy/cuda/nvtx.py create mode 100644 cupy/cuda/pinned_memory.pxd create mode 100644 cupy/cuda/pinned_memory.pyx create mode 100644 cupy/cuda/profiler.py create mode 100644 cupy/cuda/runtime.py create mode 100644 cupy/cuda/stream.pxd create mode 100644 cupy/cuda/stream.pyx create mode 100644 cupy/cuda/texture.pxd create mode 100644 cupy/cuda/texture.pyx create mode 100644 cupy/cuda/thrust.pyx create mode 100644 cupy/cudnn.py create mode 100644 cupy/cusolver.py create mode 100644 cupy/cusparse.py create mode 100644 cupy/cutensor.py create mode 100644 cupy/fft/__init__.pxd create mode 100644 cupy/fft/__init__.py create mode 100644 cupy/fft/_cache.pyx create mode 100644 cupy/fft/_callback.pyx create mode 100644 cupy/fft/_fft.py create mode 100644 cupy/fft/config.py create mode 100644 cupy/lib/__init__.py create mode 100644 cupy/lib/_polynomial.pyx create mode 100644 cupy/lib/_routines_poly.py create mode 100644 cupy/lib/_shape_base.py create mode 100644 cupy/lib/stride_tricks.py create mode 100644 cupy/linalg/__init__.py create mode 100644 cupy/linalg/_decomposition.py create mode 100644 cupy/linalg/_eigenvalue.py create mode 100644 cupy/linalg/_einsum.py create mode 100644 cupy/linalg/_einsum_cutn.py create mode 100644 cupy/linalg/_einsum_opt.py create mode 100644 cupy/linalg/_norms.py create mode 100644 cupy/linalg/_product.py create mode 100644 cupy/linalg/_solve.py create mode 100644 cupy/linalg/_util.py create mode 100644 cupy/polynomial/__init__.py create mode 100644 cupy/polynomial/polynomial.py create mode 100644 cupy/polynomial/polyutils.py create mode 100644 cupy/prof/__init__.py create mode 100644 cupy/prof/_time_range.py create mode 100644 cupy/random/LICENSE create mode 100644 cupy/random/__init__.py create mode 100644 cupy/random/_bit_generator.pyx create mode 100644 
cupy/random/_distributions.py create mode 100644 cupy/random/_generator.py create mode 100644 cupy/random/_generator_api.pyx create mode 100644 cupy/random/_kernels.py create mode 100644 cupy/random/_permutations.py create mode 100644 cupy/random/_sample.py create mode 100644 cupy/random/cupy_distributions.cu create mode 100644 cupy/random/cupy_distributions.cuh create mode 100644 cupy/sparse/__init__.py create mode 100644 cupy/sparse/linalg/__init__.py create mode 100644 cupy/testing/__init__.py create mode 100644 cupy/testing/_array.py create mode 100644 cupy/testing/_attr.py create mode 100644 cupy/testing/_bundle.py create mode 100644 cupy/testing/_condition.py create mode 100644 cupy/testing/_helper.py create mode 100644 cupy/testing/_hypothesis.py create mode 100644 cupy/testing/_loops.py create mode 100644 cupy/testing/_parameterized.py create mode 100644 cupy/testing/_pytest_impl.py create mode 100644 cupy/testing/_random.py create mode 100644 cupy/typing/__init__.py create mode 100644 cupy/typing/_generic_alias.py create mode 100644 cupy_backends/__init__.pxd create mode 100755 cupy_backends/__init__.py create mode 100644 cupy_backends/cuda/__init__.pxd create mode 100644 cupy_backends/cuda/__init__.py create mode 100644 cupy_backends/cuda/_softlink.pxd create mode 100644 cupy_backends/cuda/_softlink.pyx create mode 100644 cupy_backends/cuda/api/__init__.pxd create mode 100644 cupy_backends/cuda/api/__init__.py create mode 100644 cupy_backends/cuda/api/_driver_enum.pxd create mode 100644 cupy_backends/cuda/api/_driver_enum.pyx create mode 100644 cupy_backends/cuda/api/_driver_extern.pxi create mode 100644 cupy_backends/cuda/api/_driver_typedef.pxi create mode 100644 cupy_backends/cuda/api/_runtime_enum.pxd create mode 100644 cupy_backends/cuda/api/_runtime_enum.pyx create mode 100644 cupy_backends/cuda/api/_runtime_extern.pxi create mode 100644 cupy_backends/cuda/api/_runtime_typedef.pxi create mode 100644 cupy_backends/cuda/api/driver.pxd create mode 100644 cupy_backends/cuda/api/driver.pyx create mode 100644 cupy_backends/cuda/api/runtime.pxd create mode 100644 cupy_backends/cuda/api/runtime.pyx create mode 100644 cupy_backends/cuda/cupy_cublas.h create mode 100644 cupy_backends/cuda/cupy_cuda.h create mode 100644 cupy_backends/cuda/cupy_cuda_profiler_api.h create mode 100644 cupy_backends/cuda/cupy_cuda_runtime.h create mode 100644 cupy_backends/cuda/cupy_cudnn.h create mode 100644 cupy_backends/cuda/cupy_cusolver.h create mode 100644 cupy_backends/cuda/cupy_cusparse.h create mode 100644 cupy_backends/cuda/cupy_cutensor.h create mode 100644 cupy_backends/cuda/cupy_nccl.h create mode 100644 cupy_backends/cuda/cupy_nvrtc.h create mode 100644 cupy_backends/cuda/libs/__init__.pxd create mode 100644 cupy_backends/cuda/libs/__init__.py create mode 100644 cupy_backends/cuda/libs/cublas.pxd create mode 100644 cupy_backends/cuda/libs/cublas.pyx create mode 100644 cupy_backends/cuda/libs/cudnn.pxd create mode 100644 cupy_backends/cuda/libs/cudnn.pyx create mode 100644 cupy_backends/cuda/libs/curand.pxd create mode 100644 cupy_backends/cuda/libs/curand.pyx create mode 100644 cupy_backends/cuda/libs/cusolver.pxd create mode 100644 cupy_backends/cuda/libs/cusolver.pyx create mode 100644 cupy_backends/cuda/libs/cusparse.pxd create mode 100644 cupy_backends/cuda/libs/cusparse.pyx create mode 100644 cupy_backends/cuda/libs/cusparselt.pxd create mode 100644 cupy_backends/cuda/libs/cusparselt.pyx create mode 100644 cupy_backends/cuda/libs/cutensor.pxd create mode 100644 
cupy_backends/cuda/libs/cutensor.pyx create mode 100644 cupy_backends/cuda/libs/nccl.pxd create mode 100644 cupy_backends/cuda/libs/nccl.pyx create mode 100644 cupy_backends/cuda/libs/nvrtc.pxd create mode 100644 cupy_backends/cuda/libs/nvrtc.pyx create mode 100644 cupy_backends/cuda/libs/nvtx.pyx create mode 100644 cupy_backends/cuda/libs/profiler.pxd create mode 100644 cupy_backends/cuda/libs/profiler.pyx create mode 100755 cupy_backends/cuda/stream.pxd create mode 100755 cupy_backends/cuda/stream.pyx create mode 100644 cupy_backends/cupy_backend.h create mode 100644 cupy_backends/cupy_backend_runtime.h create mode 100644 cupy_backends/cupy_blas.h create mode 100644 cupy_backends/cupy_complex.h create mode 100644 cupy_backends/cupy_cudnn.h create mode 100644 cupy_backends/cupy_cusparselt.h create mode 100644 cupy_backends/cupy_cutensor.h create mode 100644 cupy_backends/cupy_lapack.h create mode 100644 cupy_backends/cupy_nccl.h create mode 100644 cupy_backends/cupy_profiler.h create mode 100644 cupy_backends/cupy_rand.h create mode 100644 cupy_backends/cupy_rtc.h create mode 100644 cupy_backends/cupy_sparse.h create mode 100644 cupy_backends/cupy_tx.h create mode 100644 cupy_backends/hip/cupy_cuComplex.h create mode 100644 cupy_backends/hip/cupy_hip.h create mode 100644 cupy_backends/hip/cupy_hip_common.h create mode 100644 cupy_backends/hip/cupy_hip_runtime.h create mode 100644 cupy_backends/hip/cupy_hipblas.h create mode 100644 cupy_backends/hip/cupy_hiprand.h create mode 100644 cupy_backends/hip/cupy_hiprtc.h create mode 100644 cupy_backends/hip/cupy_hipsparse.h create mode 100644 cupy_backends/hip/cupy_profiler.h create mode 100644 cupy_backends/hip/cupy_rccl.h create mode 100644 cupy_backends/hip/cupy_rocsolver.h create mode 100644 cupy_backends/hip/cupy_roctx.h create mode 100644 cupy_backends/stub/cupy_cuComplex.h create mode 100644 cupy_backends/stub/cupy_cublas.h create mode 100644 cupy_backends/stub/cupy_cuda.h create mode 100644 cupy_backends/stub/cupy_cuda_common.h create mode 100644 cupy_backends/stub/cupy_cuda_runtime.h create mode 100644 cupy_backends/stub/cupy_cudnn.h create mode 100644 cupy_backends/stub/cupy_curand.h create mode 100644 cupy_backends/stub/cupy_cusolver.h create mode 100644 cupy_backends/stub/cupy_cusparse.h create mode 100644 cupy_backends/stub/cupy_cusparselt.h create mode 100644 cupy_backends/stub/cupy_cutensor.h create mode 100644 cupy_backends/stub/cupy_nccl.h create mode 100644 cupy_backends/stub/cupy_nvrtc.h create mode 100644 cupy_backends/stub/cupy_nvtx.h create mode 100644 cupy_backends/stub/cupy_profiler.h create mode 100644 cupyx/__init__.py create mode 100644 cupyx/_gufunc.py create mode 100644 cupyx/_pinned_array.py create mode 100644 cupyx/_rsqrt.py create mode 100644 cupyx/_runtime.py create mode 100644 cupyx/_scatter.py create mode 100644 cupyx/_texture.py create mode 100644 cupyx/_ufunc_config.py create mode 100644 cupyx/cudnn.pyx create mode 100644 cupyx/cusolver.pyx create mode 100644 cupyx/cusparse.py create mode 100644 cupyx/cutensor.pyx create mode 100644 cupyx/distributed/__init__.py create mode 100644 cupyx/distributed/_array.py create mode 100644 cupyx/distributed/_comm.py create mode 100644 cupyx/distributed/_init.py create mode 100644 cupyx/distributed/_klv_utils.py create mode 100644 cupyx/distributed/_nccl_comm.py create mode 100644 cupyx/distributed/_store.py create mode 100644 cupyx/distributed/_store_actions.py create mode 100644 cupyx/fallback_mode/__init__.py create mode 100644 cupyx/fallback_mode/fallback.py create mode 
100644 cupyx/fallback_mode/notification.py create mode 100644 cupyx/jit/__init__.py create mode 100644 cupyx/jit/_builtin_funcs.py create mode 100644 cupyx/jit/_compile.py create mode 100644 cupyx/jit/_cuda_typerules.py create mode 100644 cupyx/jit/_cuda_types.py create mode 100644 cupyx/jit/_interface.py create mode 100644 cupyx/jit/_internal_types.py create mode 100644 cupyx/jit/cg.py create mode 100644 cupyx/jit/cub.py create mode 100644 cupyx/jit/thrust.py create mode 100644 cupyx/lapack.py create mode 100644 cupyx/linalg/__init__.py create mode 100644 cupyx/linalg/_solve.py create mode 100644 cupyx/linalg/sparse/__init__.py create mode 100644 cupyx/linalg/sparse/_solve.py create mode 100644 cupyx/optimizing/__init__.py create mode 100644 cupyx/optimizing/_optimize.py create mode 100644 cupyx/profiler/__init__.py create mode 100644 cupyx/profiler/_time.py create mode 100644 cupyx/profiler/_time_range.py create mode 100644 cupyx/scipy/__init__.py create mode 100644 cupyx/scipy/_lib/__init__.py create mode 100644 cupyx/scipy/_lib/_util.py create mode 100644 cupyx/scipy/fft/__init__.py create mode 100644 cupyx/scipy/fft/_fft.py create mode 100644 cupyx/scipy/fft/_fftlog.py create mode 100644 cupyx/scipy/fft/_helper.py create mode 100644 cupyx/scipy/fft/_realtransforms.py create mode 100644 cupyx/scipy/fftpack/__init__.py create mode 100644 cupyx/scipy/fftpack/_fft.py create mode 100644 cupyx/scipy/interpolate/__init__.py create mode 100644 cupyx/scipy/interpolate/_bspline.py create mode 100644 cupyx/scipy/interpolate/_bspline2.py create mode 100644 cupyx/scipy/interpolate/_cubic.py create mode 100644 cupyx/scipy/interpolate/_interpolate.py create mode 100644 cupyx/scipy/interpolate/_polyint.py create mode 100644 cupyx/scipy/interpolate/_rbfinterp.py create mode 100644 cupyx/scipy/interpolate/_rgi.py create mode 100644 cupyx/scipy/linalg/__init__.py create mode 100644 cupyx/scipy/linalg/_decomp_lu.py create mode 100644 cupyx/scipy/linalg/_solve_triangular.py create mode 100644 cupyx/scipy/linalg/_special_matrices.py create mode 100644 cupyx/scipy/linalg/_uarray.py create mode 100644 cupyx/scipy/ndimage/__init__.py create mode 100644 cupyx/scipy/ndimage/_filters.py create mode 100644 cupyx/scipy/ndimage/_filters_core.py create mode 100644 cupyx/scipy/ndimage/_filters_generic.py create mode 100644 cupyx/scipy/ndimage/_fourier.py create mode 100644 cupyx/scipy/ndimage/_interp_kernels.py create mode 100644 cupyx/scipy/ndimage/_interpolation.py create mode 100644 cupyx/scipy/ndimage/_measurements.py create mode 100644 cupyx/scipy/ndimage/_morphology.py create mode 100644 cupyx/scipy/ndimage/_spline_kernel_weights.py create mode 100644 cupyx/scipy/ndimage/_spline_prefilter_core.py create mode 100644 cupyx/scipy/ndimage/_util.py create mode 100644 cupyx/scipy/signal/__init__.py create mode 100644 cupyx/scipy/signal/_bsplines.py create mode 100644 cupyx/scipy/signal/_signaltools.py create mode 100644 cupyx/scipy/signal/_signaltools_core.py create mode 100644 cupyx/scipy/sparse/__init__.py create mode 100644 cupyx/scipy/sparse/_base.py create mode 100644 cupyx/scipy/sparse/_compressed.py create mode 100644 cupyx/scipy/sparse/_construct.py create mode 100644 cupyx/scipy/sparse/_coo.py create mode 100644 cupyx/scipy/sparse/_csc.py create mode 100644 cupyx/scipy/sparse/_csr.py create mode 100644 cupyx/scipy/sparse/_data.py create mode 100644 cupyx/scipy/sparse/_dia.py create mode 100644 cupyx/scipy/sparse/_extract.py create mode 100644 cupyx/scipy/sparse/_index.py create mode 100644 
cupyx/scipy/sparse/_sputils.py create mode 100644 cupyx/scipy/sparse/_util.py create mode 100644 cupyx/scipy/sparse/csgraph/__init__.py create mode 100644 cupyx/scipy/sparse/csgraph/_traversal.py create mode 100644 cupyx/scipy/sparse/linalg/__init__.py create mode 100644 cupyx/scipy/sparse/linalg/_eigen.py create mode 100644 cupyx/scipy/sparse/linalg/_interface.py create mode 100644 cupyx/scipy/sparse/linalg/_iterative.py create mode 100644 cupyx/scipy/sparse/linalg/_lobpcg.py create mode 100644 cupyx/scipy/sparse/linalg/_norm.py create mode 100644 cupyx/scipy/sparse/linalg/_solve.py create mode 100644 cupyx/scipy/spatial/__init__.py create mode 100644 cupyx/scipy/spatial/distance.py create mode 100644 cupyx/scipy/special/__init__.py create mode 100644 cupyx/scipy/special/_basic.py create mode 100644 cupyx/scipy/special/_bessel.py create mode 100644 cupyx/scipy/special/_beta.py create mode 100644 cupyx/scipy/special/_complexstuff.py create mode 100644 cupyx/scipy/special/_convex_analysis.py create mode 100644 cupyx/scipy/special/_digamma.py create mode 100644 cupyx/scipy/special/_erf.py create mode 100644 cupyx/scipy/special/_exp1.py create mode 100644 cupyx/scipy/special/_expi.py create mode 100644 cupyx/scipy/special/_expn.py create mode 100644 cupyx/scipy/special/_gamma.py create mode 100644 cupyx/scipy/special/_gammainc.py create mode 100644 cupyx/scipy/special/_gammaln.py create mode 100644 cupyx/scipy/special/_gammasgn.py create mode 100644 cupyx/scipy/special/_loggamma.py create mode 100644 cupyx/scipy/special/_logsoftmax.py create mode 100644 cupyx/scipy/special/_logsumexp.py create mode 100644 cupyx/scipy/special/_lpmv.py create mode 100644 cupyx/scipy/special/_poch.py create mode 100644 cupyx/scipy/special/_polygamma.py create mode 100644 cupyx/scipy/special/_softmax.py create mode 100644 cupyx/scipy/special/_sph_harm.py create mode 100644 cupyx/scipy/special/_statistics.py create mode 100644 cupyx/scipy/special/_stats_distributions.py create mode 100644 cupyx/scipy/special/_trig.py create mode 100644 cupyx/scipy/special/_xlogy.py create mode 100644 cupyx/scipy/special/_zeta.py create mode 100644 cupyx/scipy/stats/__init__.py create mode 100644 cupyx/scipy/stats/_distributions.py create mode 100644 cupyx/scipy/stats/_morestats.py create mode 100644 cupyx/scipy/stats/_stats.py create mode 100644 cupyx/scipy/stats/_stats_py.py create mode 100644 cupyx/time.py create mode 100644 cupyx/tools/__init__.py create mode 100644 cupyx/tools/_hipsparse_stub_mapper.py create mode 100755 cupyx/tools/install_library.py create mode 100644 docker/python3/Dockerfile create mode 100644 docker/rocm/Dockerfile create mode 100644 docker/rocm/README.md create mode 100644 docs/LICENSE_THIRD_PARTY create mode 100644 docs/Makefile create mode 100644 docs/image/cupy_logo_1000px.png create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt create mode 100644 docs/source/.gitignore create mode 100644 docs/source/_comparison_generator.py create mode 100644 docs/source/_static/favicon.ico create mode 100644 docs/source/_templates/autosummary/class.rst create mode 100644 docs/source/conf.py create mode 100644 docs/source/contribution.rst create mode 100644 docs/source/index.rst create mode 100644 docs/source/install.rst create mode 100644 docs/source/license.rst create mode 100644 docs/source/overview.rst create mode 100644 docs/source/reference/_deprecated.rst create mode 100644 docs/source/reference/_private.rst create mode 100644 docs/source/reference/array_api.rst create mode 100644 
docs/source/reference/array_api_array.rst create mode 100644 docs/source/reference/array_api_functions.rst create mode 100644 docs/source/reference/binary.rst create mode 100644 docs/source/reference/comparison.rst create mode 100644 docs/source/reference/creation.rst create mode 100644 docs/source/reference/cuda.rst create mode 100644 docs/source/reference/distributed.rst create mode 100644 docs/source/reference/dtype.rst create mode 100644 docs/source/reference/environment.rst create mode 100644 docs/source/reference/ext.rst create mode 100644 docs/source/reference/fft.rst create mode 100644 docs/source/reference/fftpack.rst create mode 100644 docs/source/reference/functional.rst create mode 100644 docs/source/reference/index.rst create mode 100644 docs/source/reference/indexing.rst create mode 100644 docs/source/reference/io.rst create mode 100644 docs/source/reference/kernel.rst create mode 100644 docs/source/reference/linalg.rst create mode 100644 docs/source/reference/logic.rst create mode 100644 docs/source/reference/manipulation.rst create mode 100644 docs/source/reference/math.rst create mode 100644 docs/source/reference/misc.rst create mode 100644 docs/source/reference/ndarray.rst create mode 100644 docs/source/reference/ndimage.rst create mode 100644 docs/source/reference/pad.rst create mode 100644 docs/source/reference/polynomials.rst create mode 100644 docs/source/reference/random.rst create mode 100644 docs/source/reference/routines.rst create mode 100644 docs/source/reference/scipy.rst create mode 100644 docs/source/reference/scipy_fft.rst create mode 100644 docs/source/reference/scipy_fftpack.rst create mode 100644 docs/source/reference/scipy_interpolate.rst create mode 100644 docs/source/reference/scipy_linalg.rst create mode 100644 docs/source/reference/scipy_ndimage.rst create mode 100644 docs/source/reference/scipy_signal.rst create mode 100644 docs/source/reference/scipy_sparse.rst create mode 100644 docs/source/reference/scipy_sparse_csgraph.rst create mode 100644 docs/source/reference/scipy_sparse_linalg.rst create mode 100644 docs/source/reference/scipy_spatial.rst create mode 100644 docs/source/reference/scipy_spatial_distance.rst create mode 100644 docs/source/reference/scipy_special.rst create mode 100644 docs/source/reference/scipy_stats.rst create mode 100644 docs/source/reference/set.rst create mode 100644 docs/source/reference/signal.rst create mode 100644 docs/source/reference/sorting.rst create mode 100644 docs/source/reference/sparse.rst create mode 100644 docs/source/reference/special.rst create mode 100644 docs/source/reference/statistics.rst create mode 100644 docs/source/reference/stats.rst create mode 100644 docs/source/reference/testing.rst create mode 100644 docs/source/reference/ufunc.rst create mode 100644 docs/source/reference/window.rst create mode 100644 docs/source/spelling_wordlist.txt create mode 100644 docs/source/upgrade.rst create mode 100644 docs/source/user_guide/basic.rst create mode 100644 docs/source/user_guide/compatibility.rst create mode 100644 docs/source/user_guide/cuda_api.rst create mode 100644 docs/source/user_guide/difference.rst create mode 100644 docs/source/user_guide/fft.rst create mode 100644 docs/source/user_guide/index.rst create mode 100644 docs/source/user_guide/interoperability.rst create mode 100644 docs/source/user_guide/kernel.rst create mode 100644 docs/source/user_guide/memory.rst create mode 100644 docs/source/user_guide/performance.rst create mode 100644 examples/cg/cg.py create mode 100644 
examples/cusparselt/matmul.py create mode 100644 examples/custom_struct/README.md create mode 100644 examples/custom_struct/builtin_vectors.py create mode 100644 examples/custom_struct/complex_struct.py create mode 100644 examples/custom_struct/packed_matrix.py create mode 100644 examples/cutensor/contraction.py create mode 100644 examples/cutensor/elementwise_binary.py create mode 100644 examples/cutensor/elementwise_trinary.py create mode 100644 examples/cutensor/reduction.py create mode 100644 examples/finance/black_scholes.py create mode 100644 examples/finance/monte_carlo.py create mode 100644 examples/finance/monte_carlo_multigpu.py create mode 100644 examples/gemm/README.md create mode 100644 examples/gemm/sgemm.cu create mode 100644 examples/gemm/sgemm.py create mode 100644 examples/gemm/utils.py create mode 100644 examples/gmm/README.md create mode 100644 examples/gmm/gmm.py create mode 100644 examples/interoperability/mpi4py_multiple_devices.py create mode 100644 examples/jit/elmentwise_op.py create mode 100644 examples/jit/reduction_atomic.py create mode 100644 examples/jit/reduction_simple.py create mode 100644 examples/kmeans/README.md create mode 100644 examples/kmeans/kmeans.py create mode 100644 examples/peer/peer_matrix.py create mode 100644 examples/stream/cublas.py create mode 100644 examples/stream/cudnn.py create mode 100644 examples/stream/cufft.py create mode 100644 examples/stream/cupy_event.py create mode 100644 examples/stream/cupy_kernel.py create mode 100644 examples/stream/cupy_memcpy.py create mode 100644 examples/stream/curand.py create mode 100644 examples/stream/cusolver.py create mode 100644 examples/stream/cusparse.py create mode 100644 examples/stream/map_reduce.py create mode 100644 examples/stream/thrust.py create mode 100644 install/cupy_builder/__init__.py create mode 100644 install/cupy_builder/_command.py create mode 100644 install/cupy_builder/_compiler.py create mode 100644 install/cupy_builder/_context.py create mode 100644 install/cupy_builder/_environment.py create mode 100644 install/cupy_builder/_features.py create mode 100644 install/cupy_builder/_preflight.py create mode 100644 install/cupy_builder/cupy_setup_build.py create mode 100644 install/cupy_builder/install_build.py create mode 100644 install/cupy_builder/install_utils.py create mode 100644 install/mypy.ini create mode 100644 install/universal_pkg/RELEASE.md create mode 100644 install/universal_pkg/__init__.py create mode 100644 install/universal_pkg/setup.py create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 tests/conftest.py create mode 100644 tests/cupy_tests/__init__.py create mode 100644 tests/cupy_tests/array_api_tests/__init__.py create mode 100644 tests/cupy_tests/array_api_tests/test_array_object.py create mode 100644 tests/cupy_tests/array_api_tests/test_creation_functions.py create mode 100644 tests/cupy_tests/array_api_tests/test_data_type_functions.py create mode 100644 tests/cupy_tests/array_api_tests/test_elementwise_functions.py create mode 100644 tests/cupy_tests/array_api_tests/test_indexing_functions.py create mode 100644 tests/cupy_tests/array_api_tests/test_set_functions.py create mode 100644 tests/cupy_tests/array_api_tests/test_sorting_functions.py create mode 100644 tests/cupy_tests/array_api_tests/test_validation.py create mode 100644 tests/cupy_tests/binary_tests/__init__.py create mode 100644 tests/cupy_tests/binary_tests/test_elementwise.py create mode 100644 tests/cupy_tests/binary_tests/test_packing.py create mode 100644 
tests/cupy_tests/core_tests/__init__.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/__init__.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/fusion_utils.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_array.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_example.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_indexing.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_kernel_cache.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_misc.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_optimization.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_reduction.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_routines.py create mode 100644 tests/cupy_tests/core_tests/fusion_tests/test_ufunc.py create mode 100644 tests/cupy_tests/core_tests/test_array_function.py create mode 100644 tests/cupy_tests/core_tests/test_carray.py create mode 100644 tests/cupy_tests/core_tests/test_core.py create mode 100644 tests/cupy_tests/core_tests/test_cub_reduction.py create mode 100644 tests/cupy_tests/core_tests/test_dlpack.py create mode 100644 tests/cupy_tests/core_tests/test_elementwise.py create mode 100644 tests/cupy_tests/core_tests/test_flags.py create mode 100644 tests/cupy_tests/core_tests/test_function.py create mode 100644 tests/cupy_tests/core_tests/test_gufuncs.py create mode 100644 tests/cupy_tests/core_tests/test_include.py create mode 100644 tests/cupy_tests/core_tests/test_internal.py create mode 100644 tests/cupy_tests/core_tests/test_iter.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_adv_indexing.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_complex_ops.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_contiguity.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_conversion.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_copy_and_view.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_cuda_array_interface.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_elementwise_op.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_get.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_indexing.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_math.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_owndata.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_reduction.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_scatter.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_ufunc.py create mode 100644 tests/cupy_tests/core_tests/test_ndarray_unary_op.py create mode 100644 tests/cupy_tests/core_tests/test_raw.py create mode 100644 tests/cupy_tests/core_tests/test_reduction.py create mode 100644 tests/cupy_tests/core_tests/test_scan.py create mode 100644 tests/cupy_tests/core_tests/test_syncdetect.py create mode 100644 tests/cupy_tests/core_tests/test_ufunc_methods.py create mode 100644 tests/cupy_tests/core_tests/test_userkernel.py create mode 100644 tests/cupy_tests/creation_tests/__init__.py create mode 100644 tests/cupy_tests/creation_tests/test_basic.py create mode 100644 tests/cupy_tests/creation_tests/test_from_data.py create mode 100644 tests/cupy_tests/creation_tests/test_matrix.py create mode 100644 tests/cupy_tests/creation_tests/test_ranges.py create mode 100644 
tests/cupy_tests/cuda_tests/__init__.py create mode 100644 tests/cupy_tests/cuda_tests/memory_hooks_tests/__init__.py create mode 100644 tests/cupy_tests/cuda_tests/memory_hooks_tests/test_debug_print.py create mode 100644 tests/cupy_tests/cuda_tests/memory_hooks_tests/test_line_profile.py create mode 100644 tests/cupy_tests/cuda_tests/test_compiler.py create mode 100644 tests/cupy_tests/cuda_tests/test_cublas.py create mode 100644 tests/cupy_tests/cuda_tests/test_cudnn.py create mode 100644 tests/cupy_tests/cuda_tests/test_cufft.py create mode 100644 tests/cupy_tests/cuda_tests/test_curand.py create mode 100644 tests/cupy_tests/cuda_tests/test_cusolver.py create mode 100644 tests/cupy_tests/cuda_tests/test_cusparse.py create mode 100644 tests/cupy_tests/cuda_tests/test_cutensor.py create mode 100644 tests/cupy_tests/cuda_tests/test_device.py create mode 100644 tests/cupy_tests/cuda_tests/test_driver.py create mode 100644 tests/cupy_tests/cuda_tests/test_graph.py create mode 100644 tests/cupy_tests/cuda_tests/test_memory.py create mode 100644 tests/cupy_tests/cuda_tests/test_memory_hook.py create mode 100644 tests/cupy_tests/cuda_tests/test_nccl.py create mode 100644 tests/cupy_tests/cuda_tests/test_nvrtc.py create mode 100644 tests/cupy_tests/cuda_tests/test_nvtx.py create mode 100644 tests/cupy_tests/cuda_tests/test_pinned_memory.py create mode 100644 tests/cupy_tests/cuda_tests/test_profile.py create mode 100644 tests/cupy_tests/cuda_tests/test_runtime.py create mode 100644 tests/cupy_tests/cuda_tests/test_stream.py create mode 100644 tests/cupy_tests/cuda_tests/test_texture.py create mode 100644 tests/cupy_tests/fft_tests/__init__.py create mode 100644 tests/cupy_tests/fft_tests/test_cache.py create mode 100644 tests/cupy_tests/fft_tests/test_callback.py create mode 100644 tests/cupy_tests/fft_tests/test_fft.py create mode 100644 tests/cupy_tests/functional_tests/__init__.py create mode 100644 tests/cupy_tests/functional_tests/test_piecewise.py create mode 100644 tests/cupy_tests/functional_tests/test_vectorize.py create mode 100644 tests/cupy_tests/indexing_tests/__init__.py create mode 100644 tests/cupy_tests/indexing_tests/test_generate.py create mode 100644 tests/cupy_tests/indexing_tests/test_indexing.py create mode 100644 tests/cupy_tests/indexing_tests/test_insert.py create mode 100644 tests/cupy_tests/indexing_tests/test_iterate.py create mode 100644 tests/cupy_tests/io_tests/__init__.py create mode 100644 tests/cupy_tests/io_tests/test_base_n.py create mode 100644 tests/cupy_tests/io_tests/test_formatting.py create mode 100644 tests/cupy_tests/io_tests/test_npz.py create mode 100644 tests/cupy_tests/io_tests/test_text.py create mode 100644 tests/cupy_tests/lib_tests/__init__.py create mode 100644 tests/cupy_tests/lib_tests/test_polynomial.py create mode 100644 tests/cupy_tests/lib_tests/test_shape_base.py create mode 100644 tests/cupy_tests/lib_tests/test_strided_tricks.py create mode 100644 tests/cupy_tests/linalg_tests/__init__.py create mode 100644 tests/cupy_tests/linalg_tests/test_decomposition.py create mode 100644 tests/cupy_tests/linalg_tests/test_eigenvalue.py create mode 100644 tests/cupy_tests/linalg_tests/test_einsum.py create mode 100644 tests/cupy_tests/linalg_tests/test_norms.py create mode 100644 tests/cupy_tests/linalg_tests/test_product.py create mode 100644 tests/cupy_tests/linalg_tests/test_solve.py create mode 100644 tests/cupy_tests/logic_tests/__init__.py create mode 100644 tests/cupy_tests/logic_tests/test_comparison.py create mode 100644 
tests/cupy_tests/logic_tests/test_content.py create mode 100644 tests/cupy_tests/logic_tests/test_ops.py create mode 100644 tests/cupy_tests/logic_tests/test_truth.py create mode 100644 tests/cupy_tests/logic_tests/test_type_test.py create mode 100644 tests/cupy_tests/manipulation_tests/__init__.py create mode 100644 tests/cupy_tests/manipulation_tests/test_add_remove.py create mode 100644 tests/cupy_tests/manipulation_tests/test_basic.py create mode 100644 tests/cupy_tests/manipulation_tests/test_dims.py create mode 100644 tests/cupy_tests/manipulation_tests/test_join.py create mode 100644 tests/cupy_tests/manipulation_tests/test_kind.py create mode 100644 tests/cupy_tests/manipulation_tests/test_rearrange.py create mode 100644 tests/cupy_tests/manipulation_tests/test_shape.py create mode 100644 tests/cupy_tests/manipulation_tests/test_split.py create mode 100644 tests/cupy_tests/manipulation_tests/test_tiling.py create mode 100644 tests/cupy_tests/manipulation_tests/test_transpose.py create mode 100644 tests/cupy_tests/math_tests/__init__.py create mode 100644 tests/cupy_tests/math_tests/test_arithmetic.py create mode 100644 tests/cupy_tests/math_tests/test_explog.py create mode 100644 tests/cupy_tests/math_tests/test_floating.py create mode 100644 tests/cupy_tests/math_tests/test_hyperbolic.py create mode 100644 tests/cupy_tests/math_tests/test_matmul.py create mode 100644 tests/cupy_tests/math_tests/test_misc.py create mode 100644 tests/cupy_tests/math_tests/test_rational.py create mode 100644 tests/cupy_tests/math_tests/test_rounding.py create mode 100644 tests/cupy_tests/math_tests/test_special.py create mode 100644 tests/cupy_tests/math_tests/test_sumprod.py create mode 100644 tests/cupy_tests/math_tests/test_trigonometric.py create mode 100644 tests/cupy_tests/math_tests/test_window.py create mode 100644 tests/cupy_tests/misc_tests/__init__.py create mode 100644 tests/cupy_tests/misc_tests/test_byte_bounds.py create mode 100644 tests/cupy_tests/misc_tests/test_memory_ranges.py create mode 100644 tests/cupy_tests/misc_tests/test_who.py create mode 100644 tests/cupy_tests/padding_tests/__init__.py create mode 100644 tests/cupy_tests/padding_tests/test_pad.py create mode 100644 tests/cupy_tests/polynomial_tests/test_polynomial.py create mode 100644 tests/cupy_tests/polynomial_tests/test_polyutils.py create mode 100644 tests/cupy_tests/prof_tests/__init__.py create mode 100644 tests/cupy_tests/prof_tests/test_range.py create mode 100644 tests/cupy_tests/random_tests/__init__.py create mode 100644 tests/cupy_tests/random_tests/common_distributions.py create mode 100644 tests/cupy_tests/random_tests/test_bit_generator.py create mode 100644 tests/cupy_tests/random_tests/test_distributions.py create mode 100644 tests/cupy_tests/random_tests/test_generator.py create mode 100644 tests/cupy_tests/random_tests/test_generator_api.py create mode 100644 tests/cupy_tests/random_tests/test_init.py create mode 100644 tests/cupy_tests/random_tests/test_permutations.py create mode 100644 tests/cupy_tests/random_tests/test_random.py create mode 100644 tests/cupy_tests/random_tests/test_sample.py create mode 100644 tests/cupy_tests/sorting_tests/__init__.py create mode 100644 tests/cupy_tests/sorting_tests/test_count.py create mode 100644 tests/cupy_tests/sorting_tests/test_search.py create mode 100644 tests/cupy_tests/sorting_tests/test_sort.py create mode 100644 tests/cupy_tests/statistics_tests/__init__.py create mode 100644 tests/cupy_tests/statistics_tests/test_correlation.py create mode 100644 
tests/cupy_tests/statistics_tests/test_histogram.py create mode 100644 tests/cupy_tests/statistics_tests/test_meanvar.py create mode 100644 tests/cupy_tests/statistics_tests/test_order.py create mode 100644 tests/cupy_tests/test_cublas.py create mode 100644 tests/cupy_tests/test_init.py create mode 100644 tests/cupy_tests/test_ndim.py create mode 100644 tests/cupy_tests/test_numpy_interop.py create mode 100644 tests/cupy_tests/test_type_routines.py create mode 100644 tests/cupy_tests/test_typing.py create mode 100644 tests/cupy_tests/testing_tests/__init__.py create mode 100644 tests/cupy_tests/testing_tests/test_array.py create mode 100644 tests/cupy_tests/testing_tests/test_condition.py create mode 100644 tests/cupy_tests/testing_tests/test_helper.py create mode 100644 tests/cupy_tests/testing_tests/test_loops.py create mode 100644 tests/cupy_tests/testing_tests/test_parameterized.py create mode 100644 tests/cupyx_tests/__init__.py create mode 100644 tests/cupyx_tests/distributed_tests/comm_runner.py create mode 100644 tests/cupyx_tests/distributed_tests/test_array.py create mode 100644 tests/cupyx_tests/distributed_tests/test_comm.py create mode 100644 tests/cupyx_tests/distributed_tests/test_store.py create mode 100644 tests/cupyx_tests/fallback_mode_tests/__init__.py create mode 100644 tests/cupyx_tests/fallback_mode_tests/test_fallback.py create mode 100644 tests/cupyx_tests/fallback_mode_tests/test_notifications.py create mode 100644 tests/cupyx_tests/jit_tests/__init__.py create mode 100644 tests/cupyx_tests/jit_tests/test_cooperative_groups.py create mode 100644 tests/cupyx_tests/jit_tests/test_cub.py create mode 100644 tests/cupyx_tests/jit_tests/test_device_function.py create mode 100644 tests/cupyx_tests/jit_tests/test_raw.py create mode 100644 tests/cupyx_tests/jit_tests/test_thrust.py create mode 100644 tests/cupyx_tests/linalg_tests/__init__.py create mode 100644 tests/cupyx_tests/linalg_tests/sparse_tests/__init__.py create mode 100644 tests/cupyx_tests/linalg_tests/sparse_tests/test_solve.py create mode 100644 tests/cupyx_tests/linalg_tests/test_solve.py create mode 100644 tests/cupyx_tests/profiler_tests/__init__.py create mode 100644 tests/cupyx_tests/profiler_tests/test_benchmark.py create mode 100644 tests/cupyx_tests/profiler_tests/test_profile.py create mode 100644 tests/cupyx_tests/profiler_tests/test_time_range.py create mode 100644 tests/cupyx_tests/scipy_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/fft_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/fft_tests/test_fft.py create mode 100644 tests/cupyx_tests/scipy_tests/fft_tests/test_fftlog.py create mode 100644 tests/cupyx_tests/scipy_tests/fft_tests/test_helper.py create mode 100644 tests/cupyx_tests/scipy_tests/fft_tests/test_realtransforms.py create mode 100644 tests/cupyx_tests/scipy_tests/fftpack_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/fftpack_tests/test_fftpack.py create mode 100644 tests/cupyx_tests/scipy_tests/interpolate_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/interpolate_tests/test_bspline.py create mode 100644 tests/cupyx_tests/scipy_tests/interpolate_tests/test_bspline2.py create mode 100644 tests/cupyx_tests/scipy_tests/interpolate_tests/test_polyint.py create mode 100644 tests/cupyx_tests/scipy_tests/interpolate_tests/test_ppoly.py create mode 100644 tests/cupyx_tests/scipy_tests/interpolate_tests/test_rbfinterp.py create mode 100644 tests/cupyx_tests/scipy_tests/interpolate_tests/test_rgi.py create mode 
100644 tests/cupyx_tests/scipy_tests/linalg_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/linalg_tests/test_decomp_lu.py create mode 100644 tests/cupyx_tests/scipy_tests/linalg_tests/test_solve_triangular.py create mode 100644 tests/cupyx_tests/scipy_tests/linalg_tests/test_special_matrices.py create mode 100644 tests/cupyx_tests/scipy_tests/linalg_tests/test_uarray.py create mode 100644 tests/cupyx_tests/scipy_tests/ndimage_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/ndimage_tests/test_filters.py create mode 100644 tests/cupyx_tests/scipy_tests/ndimage_tests/test_fourier.py create mode 100644 tests/cupyx_tests/scipy_tests/ndimage_tests/test_interpolation.py create mode 100644 tests/cupyx_tests/scipy_tests/ndimage_tests/test_measurements.py create mode 100644 tests/cupyx_tests/scipy_tests/ndimage_tests/test_morphology.py create mode 100644 tests/cupyx_tests/scipy_tests/signal_tests/test_bsplines.py create mode 100644 tests/cupyx_tests/scipy_tests/signal_tests/test_signaltools.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/csgraph_tests/test_traversal.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_base.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_construct.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_coo.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_csc.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_csr.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_dia.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_extract.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_index.py create mode 100644 tests/cupyx_tests/scipy_tests/sparse_tests/test_linalg.py create mode 100644 tests/cupyx_tests/scipy_tests/spatial_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/spatial_tests/test_distance.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/__init__.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_basic.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_bessel.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_beta.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_convex_analysis.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_digamma.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_erf.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_exp1.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_expi.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_expn.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_gamma.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_gammainc.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_gammaln.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_log_softmax.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_logsumexp.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_polygamma.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_softmax.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_sph_harm.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_statistics.py create mode 
100644 tests/cupyx_tests/scipy_tests/special_tests/test_ufunc_dispatch.py create mode 100644 tests/cupyx_tests/scipy_tests/special_tests/test_zeta.py create mode 100644 tests/cupyx_tests/scipy_tests/stats_tests/test_distributions.py create mode 100644 tests/cupyx_tests/scipy_tests/stats_tests/test_morestats.py create mode 100644 tests/cupyx_tests/scipy_tests/stats_tests/test_stats.py create mode 100644 tests/cupyx_tests/scipy_tests/test_get_array_module.py create mode 100644 tests/cupyx_tests/test_cudnn.py create mode 100644 tests/cupyx_tests/test_cupyx.py create mode 100644 tests/cupyx_tests/test_cusolver.py create mode 100644 tests/cupyx_tests/test_cusparse.py create mode 100644 tests/cupyx_tests/test_cutensor.py create mode 100644 tests/cupyx_tests/test_lapack.py create mode 100644 tests/cupyx_tests/test_optimize.py create mode 100644 tests/cupyx_tests/test_pinned_array.py create mode 100644 tests/cupyx_tests/test_rsqrt.py create mode 100644 tests/cupyx_tests/test_runtime.py create mode 100644 tests/cupyx_tests/test_time.py create mode 100644 tests/cupyx_tests/tools_tests/__init__.py create mode 100644 tests/cupyx_tests/tools_tests/test_install_library.py create mode 100644 tests/example_tests/__init__.py create mode 100644 tests/example_tests/example_test.py create mode 100644 tests/example_tests/test_custom_struct.py create mode 100644 tests/example_tests/test_finance.py create mode 100644 tests/example_tests/test_gemm.py create mode 100644 tests/example_tests/test_gmm.py create mode 100644 tests/example_tests/test_kmeans.py create mode 100644 tests/install_tests/__init__.py create mode 100644 tests/install_tests/test_build.py create mode 100644 tests/install_tests/test_cupy_builder/__init__.py create mode 100644 tests/install_tests/test_cupy_builder/test_command.py create mode 100644 tests/install_tests/test_cupy_builder/test_context.py create mode 100644 tests/install_tests/test_cupy_builder/test_features.py create mode 100644 tests/install_tests/test_universal_pkg/__init__.py create mode 100644 tests/install_tests/test_universal_pkg/test_setup.py create mode 100644 tests/install_tests/test_utils.py create mode 100644 third_party/cub/.cproject create mode 100644 third_party/cub/.project create mode 100644 third_party/cub/.settings/.gitignore create mode 100644 third_party/cub/.settings/org.eclipse.cdt.codan.core.prefs create mode 100644 third_party/cub/.settings/org.eclipse.cdt.core.prefs create mode 100644 third_party/cub/.settings/org.eclipse.cdt.ui.prefs create mode 100644 third_party/cub/.settings/org.eclipse.core.runtime.prefs create mode 100644 third_party/cub/CHANGE_LOG.TXT create mode 100644 third_party/cub/LICENSE.TXT create mode 100644 third_party/cub/README.md create mode 100644 third_party/cub/common.mk create mode 100644 third_party/cub/cub/agent/agent_histogram.cuh create mode 100644 third_party/cub/cub/agent/agent_radix_sort_downsweep.cuh create mode 100644 third_party/cub/cub/agent/agent_radix_sort_upsweep.cuh create mode 100644 third_party/cub/cub/agent/agent_reduce.cuh create mode 100644 third_party/cub/cub/agent/agent_reduce_by_key.cuh create mode 100644 third_party/cub/cub/agent/agent_rle.cuh create mode 100644 third_party/cub/cub/agent/agent_scan.cuh create mode 100644 third_party/cub/cub/agent/agent_segment_fixup.cuh create mode 100644 third_party/cub/cub/agent/agent_select_if.cuh create mode 100644 third_party/cub/cub/agent/agent_spmv_orig.cuh create mode 100644 third_party/cub/cub/agent/single_pass_scan_operators.cuh create mode 100644 
third_party/cub/cub/block/block_adjacent_difference.cuh create mode 100644 third_party/cub/cub/block/block_discontinuity.cuh create mode 100644 third_party/cub/cub/block/block_exchange.cuh create mode 100644 third_party/cub/cub/block/block_histogram.cuh create mode 100644 third_party/cub/cub/block/block_load.cuh create mode 100644 third_party/cub/cub/block/block_radix_rank.cuh create mode 100644 third_party/cub/cub/block/block_radix_sort.cuh create mode 100644 third_party/cub/cub/block/block_raking_layout.cuh create mode 100644 third_party/cub/cub/block/block_reduce.cuh create mode 100644 third_party/cub/cub/block/block_scan.cuh create mode 100644 third_party/cub/cub/block/block_shuffle.cuh create mode 100644 third_party/cub/cub/block/block_store.cuh create mode 100644 third_party/cub/cub/block/specializations/block_histogram_atomic.cuh create mode 100644 third_party/cub/cub/block/specializations/block_histogram_sort.cuh create mode 100644 third_party/cub/cub/block/specializations/block_reduce_raking.cuh create mode 100644 third_party/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh create mode 100644 third_party/cub/cub/block/specializations/block_reduce_warp_reductions.cuh create mode 100644 third_party/cub/cub/block/specializations/block_scan_raking.cuh create mode 100644 third_party/cub/cub/block/specializations/block_scan_warp_scans.cuh create mode 100644 third_party/cub/cub/block/specializations/block_scan_warp_scans2.cuh create mode 100644 third_party/cub/cub/block/specializations/block_scan_warp_scans3.cuh create mode 100644 third_party/cub/cub/cub.cuh create mode 100644 third_party/cub/cub/device/device_histogram.cuh create mode 100644 third_party/cub/cub/device/device_partition.cuh create mode 100644 third_party/cub/cub/device/device_radix_sort.cuh create mode 100644 third_party/cub/cub/device/device_reduce.cuh create mode 100644 third_party/cub/cub/device/device_run_length_encode.cuh create mode 100644 third_party/cub/cub/device/device_scan.cuh create mode 100644 third_party/cub/cub/device/device_segmented_radix_sort.cuh create mode 100644 third_party/cub/cub/device/device_segmented_reduce.cuh create mode 100644 third_party/cub/cub/device/device_select.cuh create mode 100644 third_party/cub/cub/device/device_spmv.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_histogram.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_radix_sort.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_reduce.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_rle.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_scan.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_select_if.cuh create mode 100644 third_party/cub/cub/device/dispatch/dispatch_spmv_orig.cuh create mode 100644 third_party/cub/cub/grid/grid_barrier.cuh create mode 100644 third_party/cub/cub/grid/grid_even_share.cuh create mode 100644 third_party/cub/cub/grid/grid_mapping.cuh create mode 100644 third_party/cub/cub/grid/grid_queue.cuh create mode 100644 third_party/cub/cub/host/mutex.cuh create mode 100644 third_party/cub/cub/iterator/arg_index_input_iterator.cuh create mode 100644 third_party/cub/cub/iterator/cache_modified_input_iterator.cuh create mode 100644 third_party/cub/cub/iterator/cache_modified_output_iterator.cuh create mode 100644 third_party/cub/cub/iterator/constant_input_iterator.cuh create mode 100644 
third_party/cub/cub/iterator/counting_input_iterator.cuh create mode 100644 third_party/cub/cub/iterator/discard_output_iterator.cuh create mode 100644 third_party/cub/cub/iterator/tex_obj_input_iterator.cuh create mode 100644 third_party/cub/cub/iterator/tex_ref_input_iterator.cuh create mode 100644 third_party/cub/cub/iterator/transform_input_iterator.cuh create mode 100644 third_party/cub/cub/thread/thread_load.cuh create mode 100644 third_party/cub/cub/thread/thread_operators.cuh create mode 100644 third_party/cub/cub/thread/thread_reduce.cuh create mode 100644 third_party/cub/cub/thread/thread_scan.cuh create mode 100644 third_party/cub/cub/thread/thread_search.cuh create mode 100644 third_party/cub/cub/thread/thread_store.cuh create mode 100644 third_party/cub/cub/util_allocator.cuh create mode 100644 third_party/cub/cub/util_arch.cuh create mode 100644 third_party/cub/cub/util_debug.cuh create mode 100644 third_party/cub/cub/util_device.cuh create mode 100644 third_party/cub/cub/util_macro.cuh create mode 100644 third_party/cub/cub/util_namespace.cuh create mode 100644 third_party/cub/cub/util_ptx.cuh create mode 100644 third_party/cub/cub/util_type.cuh create mode 100644 third_party/cub/cub/warp/specializations/warp_reduce_shfl.cuh create mode 100644 third_party/cub/cub/warp/specializations/warp_reduce_smem.cuh create mode 100644 third_party/cub/cub/warp/specializations/warp_scan_shfl.cuh create mode 100644 third_party/cub/cub/warp/specializations/warp_scan_smem.cuh create mode 100644 third_party/cub/cub/warp/warp_reduce.cuh create mode 100644 third_party/cub/cub/warp/warp_scan.cuh create mode 100644 third_party/cub/eclipse code style profile.xml create mode 100644 third_party/cub/examples/block/.gitignore create mode 100644 third_party/cub/examples/block/Makefile create mode 100644 third_party/cub/examples/block/example_block_radix_sort.cu create mode 100644 third_party/cub/examples/block/example_block_reduce.cu create mode 100644 third_party/cub/examples/block/example_block_scan.cu create mode 100644 third_party/cub/examples/block/reduce_by_key.cu create mode 100644 third_party/cub/examples/device/.gitignore create mode 100644 third_party/cub/examples/device/Makefile create mode 100644 third_party/cub/examples/device/example_device_partition_flagged.cu create mode 100644 third_party/cub/examples/device/example_device_partition_if.cu create mode 100644 third_party/cub/examples/device/example_device_radix_sort.cu create mode 100644 third_party/cub/examples/device/example_device_reduce.cu create mode 100644 third_party/cub/examples/device/example_device_scan.cu create mode 100644 third_party/cub/examples/device/example_device_select_flagged.cu create mode 100644 third_party/cub/examples/device/example_device_select_if.cu create mode 100644 third_party/cub/examples/device/example_device_select_unique.cu create mode 100644 third_party/cub/examples/device/example_device_sort_find_non_trivial_runs.cu create mode 100644 third_party/cub/experimental/.gitignore create mode 100644 third_party/cub/experimental/Makefile create mode 100644 third_party/cub/experimental/defunct/example_coo_spmv.cu create mode 100644 third_party/cub/experimental/defunct/test_device_seg_reduce.cu create mode 100644 third_party/cub/experimental/histogram/histogram_cub.h create mode 100644 third_party/cub/experimental/histogram/histogram_gmem_atomics.h create mode 100644 third_party/cub/experimental/histogram/histogram_smem_atomics.h create mode 100644 third_party/cub/experimental/histogram_compare.cu create mode 
100644 third_party/cub/experimental/sparse_matrix.h create mode 100644 third_party/cub/experimental/spmv_compare.cu create mode 100755 third_party/cub/experimental/spmv_script.sh create mode 100644 third_party/cub/test/.gitignore create mode 100644 third_party/cub/test/Makefile create mode 100644 third_party/cub/test/half.h create mode 100644 third_party/cub/test/link_a.cu create mode 100644 third_party/cub/test/link_b.cu create mode 100644 third_party/cub/test/link_main.cpp create mode 100644 third_party/cub/test/mersenne.h create mode 100644 third_party/cub/test/test_allocator.cu create mode 100644 third_party/cub/test/test_block_histogram.cu create mode 100644 third_party/cub/test/test_block_load_store.cu create mode 100644 third_party/cub/test/test_block_radix_sort.cu create mode 100644 third_party/cub/test/test_block_reduce.cu create mode 100644 third_party/cub/test/test_block_scan.cu create mode 100644 third_party/cub/test/test_device_histogram.cu create mode 100644 third_party/cub/test/test_device_radix_sort.cu create mode 100644 third_party/cub/test/test_device_reduce.cu create mode 100644 third_party/cub/test/test_device_reduce_by_key.cu create mode 100644 third_party/cub/test/test_device_run_length_encode.cu create mode 100644 third_party/cub/test/test_device_scan.cu create mode 100644 third_party/cub/test/test_device_select_if.cu create mode 100644 third_party/cub/test/test_device_select_unique.cu create mode 100644 third_party/cub/test/test_grid_barrier.cu create mode 100644 third_party/cub/test/test_iterator.cu create mode 100644 third_party/cub/test/test_util.h create mode 100644 third_party/cub/test/test_warp_reduce.cu create mode 100644 third_party/cub/test/test_warp_scan.cu create mode 100644 third_party/cub/tune/.gitignore create mode 100644 third_party/cub/tune/Makefile create mode 100644 third_party/cub/tune/tune_device_reduce.cu
diff --git a/CITATION.bib b/CITATION.bib
new file mode 100644
index 0000000..2b4ee94
--- /dev/null
+++ b/CITATION.bib
@@ -0,0 +1,7 @@
+@inproceedings{cupy_learningsys2017,
+    author = "Okuta, Ryosuke and Unno, Yuya and Nishino, Daisuke and Hido, Shohei and Loomis, Crissman",
+    title = "CuPy: A NumPy-Compatible Library for NVIDIA GPU Calculations",
+    booktitle = "Proceedings of Workshop on Machine Learning Systems (LearningSys) in The Thirty-first Annual Conference on Neural Information Processing Systems (NIPS)",
+    year = "2017",
+    url = "http://learningsys.org/nips17/assets/papers/paper_16.pdf"
+}
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..3ad1a10
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,7 @@
+# CuPy Code of Conduct
+
+CuPy follows the [NumFOCUS Code of Conduct][homepage] available at https://numfocus.org/code-of-conduct.
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at `dlfw@preferred.jp`.
+
+[homepage]: https://numfocus.org/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..de1113b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2015 Preferred Infrastructure, Inc.
+Copyright (c) 2015 Preferred Networks, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..254ff70
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,18 @@
+# Contents of sdist. See also `setup.py`.
+recursive-include cupy *.h *.hpp
+recursive-include cupy *.pyx *.pxd *.pxi
+recursive-include cupy_backends *.h *.hpp
+recursive-include cupy_backends *.pyx *.pxd *.pxi
+
+# Fail-safe to avoid including Cythonized sources in sdist.
+recursive-exclude cupy *.cpp
+recursive-exclude cupy_backends *.cpp
+
+# Installers
+recursive-include install *.py
+recursive-include tests *.py
+
+# Licenses
+include LICENSE
+include docs/LICENSE_THIRD_PARTY
+include docs/source/license.rst
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000..e98b0be
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,13 @@
+comment: false
+github_checks:
+  annotations: false
+
+coverage:
+  status:
+    # Disable coverage measurement for overall codebase.
+    project: off
+    # Enable coverage measurement for diff introduced in the pull-request,
+    # but do not mark "X" on commit status for now.
+    patch:
+      default:
+        target: '0%'
diff --git a/cupy/__init__.py b/cupy/__init__.py
new file mode 100644
index 0000000..2852d86
--- /dev/null
+++ b/cupy/__init__.py
@@ -0,0 +1,920 @@
+import functools as _functools
+import sys as _sys
+
+import numpy as _numpy
+
+from cupy import _environment
+from cupy import _version
+
+
+_environment._detect_duplicate_installation()  # NOQA
+_environment._setup_win32_dll_directory()  # NOQA
+_environment._preload_library('cutensor')  # NOQA
+_environment._preload_library('nccl')  # NOQA
+
+
+try:
+    from cupy import _core  # NOQA
+except ImportError as exc:
+    raise ImportError(f'''
+================================================================
+{_environment._diagnose_import_error()}
+
+Original error:
+  {type(exc).__name__}: {exc}
+================================================================
+''') from exc
+
+
+from cupy import cuda  # NOQA
+# Do not make `cupy.cupyx` available because it is confusing.
+# Do not make `cupy.cupyx` available because it is confusing.
+import cupyx as _cupyx  # NOQA
+
+
+def is_available():
+    return cuda.is_available()
+
+
+__version__ = _version.__version__
+
+
+from cupy import fft  # NOQA
+from cupy import linalg  # NOQA
+from cupy import polynomial  # NOQA
+from cupy import random  # NOQA
+# `cupy.sparse` is deprecated in v8
+from cupy import sparse  # NOQA
+from cupy import testing  # NOQA
+
+
+# import class and function
+from cupy._core import ndarray  # NOQA
+from cupy._core import ufunc  # NOQA
+
+
+# =============================================================================
+# Constants (borrowed from NumPy)
+# =============================================================================
+from numpy import e  # NOQA
+from numpy import euler_gamma  # NOQA
+from numpy import inf  # NOQA
+from numpy import nan  # NOQA
+from numpy import newaxis  # == None  # NOQA
+from numpy import pi  # NOQA
+
+# APIs to be removed in NumPy 2.0.
+# Remove these when bumping the baseline API to NumPy 2.0.
+# https://github.com/cupy/cupy/pull/7800
+PINF = Inf = Infinity = infty = inf  # NOQA
+NINF = -inf  # NOQA
+NAN = NaN = nan  # NOQA
+PZERO = 0.0  # NOQA
+NZERO = -0.0  # NOQA
+
+# =============================================================================
+# Data types (borrowed from NumPy)
+#
+# The order of these declarations is borrowed from the NumPy document:
+# https://numpy.org/doc/stable/reference/arrays.scalars.html
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# Generic types
+# -----------------------------------------------------------------------------
+from numpy import complexfloating  # NOQA
+from numpy import floating  # NOQA
+from numpy import generic  # NOQA
+from numpy import inexact  # NOQA
+from numpy import integer  # NOQA
+from numpy import number  # NOQA
+from numpy import signedinteger  # NOQA
+from numpy import unsignedinteger  # NOQA
+
+# Not supported by CuPy:
+# from numpy import flexible
+# from numpy import character
+
+# -----------------------------------------------------------------------------
+# Booleans
+# -----------------------------------------------------------------------------
+from numpy import bool_  # NOQA
+
+# -----------------------------------------------------------------------------
+# Integers
+# -----------------------------------------------------------------------------
+from numpy import byte  # NOQA
+from numpy import short  # NOQA
+from numpy import intc  # NOQA
+from numpy import int_  # NOQA
+from numpy import longlong  # NOQA
+from numpy import intp  # NOQA
+from numpy import int8  # NOQA
+from numpy import int16  # NOQA
+from numpy import int32  # NOQA
+from numpy import int64  # NOQA
+
+# -----------------------------------------------------------------------------
+# Unsigned integers
+# -----------------------------------------------------------------------------
+from numpy import ubyte  # NOQA
+from numpy import ushort  # NOQA
+from numpy import uintc  # NOQA
+from numpy import uint  # NOQA
+from numpy import ulonglong  # NOQA
+from numpy import uintp  # NOQA
+from numpy import uint8  # NOQA
+from numpy import uint16  # NOQA
+from numpy import uint32  # NOQA
+from numpy import uint64  # NOQA
+
+# -----------------------------------------------------------------------------
+# Floating-point numbers
+# -----------------------------------------------------------------------------
+from numpy import half  # NOQA
+from numpy import single  # NOQA
+from numpy import double  # NOQA
+from numpy import float_  # NOQA
+from numpy import longfloat  # NOQA
+from numpy import float16  # NOQA
+from numpy import float32  # NOQA
+from numpy import float64  # NOQA
+
+# Not supported by CuPy:
+# from numpy import float96
+# from numpy import float128
+
+# -----------------------------------------------------------------------------
+# Complex floating-point numbers
+# -----------------------------------------------------------------------------
+from numpy import csingle  # NOQA
+from numpy import singlecomplex  # NOQA
+from numpy import cdouble  # NOQA
+from numpy import cfloat  # NOQA
+from numpy import complex_  # NOQA
+from numpy import complex64  # NOQA
+from numpy import complex128  # NOQA
+
+# Not supported by CuPy:
+# from numpy import complex192
+# from numpy import complex256
+# from numpy import clongfloat
+
+# -----------------------------------------------------------------------------
+# Any Python object
+# -----------------------------------------------------------------------------
+
+# Not supported by CuPy:
+# from numpy import object_
+# from numpy import bytes_
+# from numpy import unicode_
+# from numpy import void
+
+# -----------------------------------------------------------------------------
+# Built-in Python types
+# -----------------------------------------------------------------------------
+
+# =============================================================================
+# Routines
+#
+# The order of these declarations is borrowed from the NumPy document:
+# https://numpy.org/doc/stable/reference/routines.html
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# Array creation routines
+# -----------------------------------------------------------------------------
+from cupy._creation.basic import empty  # NOQA
+from cupy._creation.basic import empty_like  # NOQA
+from cupy._creation.basic import eye  # NOQA
+from cupy._creation.basic import full  # NOQA
+from cupy._creation.basic import full_like  # NOQA
+from cupy._creation.basic import identity  # NOQA
+from cupy._creation.basic import ones  # NOQA
+from cupy._creation.basic import ones_like  # NOQA
+from cupy._creation.basic import zeros  # NOQA
+from cupy._creation.basic import zeros_like  # NOQA
+
+from cupy._creation.from_data import copy  # NOQA
+from cupy._creation.from_data import array  # NOQA
+from cupy._creation.from_data import asanyarray  # NOQA
+from cupy._creation.from_data import asarray  # NOQA
+from cupy._creation.from_data import ascontiguousarray  # NOQA
+from cupy._creation.from_data import fromfile  # NOQA
+from cupy._creation.from_data import fromfunction  # NOQA
+from cupy._creation.from_data import fromiter  # NOQA
+from cupy._creation.from_data import frombuffer  # NOQA
+from cupy._creation.from_data import fromstring  # NOQA
+from cupy._creation.from_data import loadtxt  # NOQA
+from cupy._creation.from_data import genfromtxt  # NOQA
+
+from cupy._creation.ranges import arange  # NOQA
+from cupy._creation.ranges import linspace  # NOQA
+from cupy._creation.ranges import logspace  # NOQA
+from cupy._creation.ranges import meshgrid  # NOQA
+from cupy._creation.ranges import mgrid  # NOQA
+from cupy._creation.ranges import ogrid  # NOQA
+
+from cupy._creation.matrix import diag  # NOQA
+from cupy._creation.matrix import diagflat  # NOQA
+from cupy._creation.matrix import tri  # NOQA
+from cupy._creation.matrix import tril  # NOQA
+from cupy._creation.matrix import triu  # NOQA
+from cupy._creation.matrix
import vander # NOQA + +# ----------------------------------------------------------------------------- +# Functional routines +# ----------------------------------------------------------------------------- +from cupy._functional.piecewise import piecewise # NOQA +from cupy._functional.vectorize import vectorize # NOQA +from cupy.lib._shape_base import apply_along_axis # NOQA + +# ----------------------------------------------------------------------------- +# Array manipulation routines +# ----------------------------------------------------------------------------- +from cupy._manipulation.basic import copyto # NOQA + +from cupy._manipulation.shape import shape # NOQA +from cupy._manipulation.shape import ravel # NOQA +from cupy._manipulation.shape import reshape # NOQA + +from cupy._manipulation.transpose import moveaxis # NOQA +from cupy._manipulation.transpose import rollaxis # NOQA +from cupy._manipulation.transpose import swapaxes # NOQA +from cupy._manipulation.transpose import transpose # NOQA + +from cupy._manipulation.dims import atleast_1d # NOQA +from cupy._manipulation.dims import atleast_2d # NOQA +from cupy._manipulation.dims import atleast_3d # NOQA +from cupy._manipulation.dims import broadcast # NOQA +from cupy._manipulation.dims import broadcast_arrays # NOQA +from cupy._manipulation.dims import broadcast_to # NOQA +from cupy._manipulation.dims import expand_dims # NOQA +from cupy._manipulation.dims import squeeze # NOQA + +from cupy._manipulation.join import column_stack # NOQA +from cupy._manipulation.join import concatenate # NOQA +from cupy._manipulation.join import dstack # NOQA +from cupy._manipulation.join import hstack # NOQA +from cupy._manipulation.join import stack # NOQA +from cupy._manipulation.join import vstack # NOQA +from cupy._manipulation.join import vstack as row_stack # NOQA + +from cupy._manipulation.kind import asarray_chkfinite # NOQA +from cupy._manipulation.kind import asfarray # NOQA +from cupy._manipulation.kind import asfortranarray # NOQA +from cupy._manipulation.kind import require # NOQA + +from cupy._manipulation.split import array_split # NOQA +from cupy._manipulation.split import dsplit # NOQA +from cupy._manipulation.split import hsplit # NOQA +from cupy._manipulation.split import split # NOQA +from cupy._manipulation.split import vsplit # NOQA + +from cupy._manipulation.tiling import repeat # NOQA +from cupy._manipulation.tiling import tile # NOQA + +from cupy._manipulation.add_remove import append # NOQA +from cupy._manipulation.add_remove import resize # NOQA +from cupy._manipulation.add_remove import unique # NOQA +from cupy._manipulation.add_remove import trim_zeros # NOQA + +from cupy._manipulation.rearrange import flip # NOQA +from cupy._manipulation.rearrange import fliplr # NOQA +from cupy._manipulation.rearrange import flipud # NOQA +from cupy._manipulation.rearrange import roll # NOQA +from cupy._manipulation.rearrange import rot90 # NOQA + +# Borrowed from NumPy +if hasattr(_numpy, 'broadcast_shapes'): # NumPy 1.20 + from numpy import broadcast_shapes # NOQA + +# ----------------------------------------------------------------------------- +# Binary operations +# ----------------------------------------------------------------------------- +from cupy._binary.elementwise import bitwise_and # NOQA +from cupy._binary.elementwise import bitwise_or # NOQA +from cupy._binary.elementwise import bitwise_xor # NOQA +from cupy._binary.elementwise import bitwise_not # NOQA +from cupy._binary.elementwise import invert # NOQA +from 
cupy._binary.elementwise import left_shift # NOQA +from cupy._binary.elementwise import right_shift # NOQA + +from cupy._binary.packing import packbits # NOQA +from cupy._binary.packing import unpackbits # NOQA + + +def binary_repr(num, width=None): + """Return the binary representation of the input number as a string. + + .. seealso:: :func:`numpy.binary_repr` + """ + return _numpy.binary_repr(num, width) + + +# ----------------------------------------------------------------------------- +# Data type routines (mostly borrowed from NumPy) +# ----------------------------------------------------------------------------- +def can_cast(from_, to, casting='safe'): + """Returns True if cast between data types can occur according to the + casting rule. If from is a scalar or array scalar, also returns True if the + scalar value can be cast without overflow or truncation to an integer. + + .. seealso:: :func:`numpy.can_cast` + """ + from_ = from_.dtype if isinstance(from_, ndarray) else from_ + return _numpy.can_cast(from_, to, casting=casting) + + +def common_type(*arrays): + """Return a scalar type which is common to the input arrays. + + .. seealso:: :func:`numpy.common_type` + """ + if len(arrays) == 0: + return _numpy.float16 + + default_float_dtype = _numpy.dtype('float64') + dtypes = [] + for a in arrays: + if a.dtype.kind == 'b': + raise TypeError('can\'t get common type for non-numeric array') + elif a.dtype.kind in 'iu': + dtypes.append(default_float_dtype) + else: + dtypes.append(a.dtype) + + return _functools.reduce(_numpy.promote_types, dtypes).type + + +def result_type(*arrays_and_dtypes): + """Returns the type that results from applying the NumPy type promotion + rules to the arguments. + + .. seealso:: :func:`numpy.result_type` + """ + dtypes = [a.dtype if isinstance(a, ndarray) + else a for a in arrays_and_dtypes] + return _numpy.result_type(*dtypes) + + +from cupy._core.core import min_scalar_type # NOQA + +from numpy import obj2sctype # NOQA +from numpy import promote_types # NOQA + +from numpy import dtype # NOQA +from numpy import format_parser # NOQA + +from numpy import finfo # NOQA +from numpy import iinfo # NOQA + +from numpy import find_common_type # NOQA +from numpy import issctype # NOQA +from numpy import issubclass_ # NOQA +from numpy import issubdtype # NOQA +from numpy import issubsctype # NOQA + +from numpy import mintypecode # NOQA +from numpy import sctype2char # NOQA +from numpy import typename # NOQA + +# ----------------------------------------------------------------------------- +# Optionally Scipy-accelerated routines +# ----------------------------------------------------------------------------- +# TODO(beam2d): Implement it + +# ----------------------------------------------------------------------------- +# Discrete Fourier Transform +# ----------------------------------------------------------------------------- +# TODO(beam2d): Implement it + +# ----------------------------------------------------------------------------- +# Indexing routines +# ----------------------------------------------------------------------------- +from cupy._indexing.generate import c_ # NOQA +from cupy._indexing.generate import indices # NOQA +from cupy._indexing.generate import ix_ # NOQA +from cupy._indexing.generate import mask_indices # NOQA +from cupy._indexing.generate import tril_indices # NOQA +from cupy._indexing.generate import tril_indices_from # NOQA +from cupy._indexing.generate import triu_indices # NOQA +from cupy._indexing.generate import triu_indices_from # 
NOQA +from cupy._indexing.generate import r_ # NOQA +from cupy._indexing.generate import ravel_multi_index # NOQA +from cupy._indexing.generate import unravel_index # NOQA + +from cupy._indexing.indexing import choose # NOQA +from cupy._indexing.indexing import compress # NOQA +from cupy._indexing.indexing import diagonal # NOQA +from cupy._indexing.indexing import extract # NOQA +from cupy._indexing.indexing import select # NOQA +from cupy._indexing.indexing import take # NOQA +from cupy._indexing.indexing import take_along_axis # NOQA + +from cupy._indexing.insert import place # NOQA +from cupy._indexing.insert import put # NOQA +from cupy._indexing.insert import putmask # NOQA +from cupy._indexing.insert import fill_diagonal # NOQA +from cupy._indexing.insert import diag_indices # NOQA +from cupy._indexing.insert import diag_indices_from # NOQA + +from cupy._indexing.iterate import flatiter # NOQA + +# Borrowed from NumPy +from numpy import get_array_wrap # NOQA +from numpy import index_exp # NOQA +from numpy import ndindex # NOQA +from numpy import s_ # NOQA + +# ----------------------------------------------------------------------------- +# Input and output +# ----------------------------------------------------------------------------- +from cupy._io.npz import load # NOQA +from cupy._io.npz import save # NOQA +from cupy._io.npz import savez # NOQA +from cupy._io.npz import savez_compressed # NOQA + +from cupy._io.formatting import array_repr # NOQA +from cupy._io.formatting import array_str # NOQA +from cupy._io.formatting import array2string # NOQA +from cupy._io.formatting import format_float_positional # NOQA +from cupy._io.formatting import format_float_scientific # NOQA + +from cupy._io.text import savetxt # NOQA + + +def base_repr(number, base=2, padding=0): # NOQA (needed to avoid redefinition of `number`) + """Return a string representation of a number in the given base system. + + .. 
seealso:: :func:`numpy.base_repr`
+    """
+    return _numpy.base_repr(number, base, padding)
+
+
+# Borrowed from NumPy
+from numpy import DataSource  # NOQA
+from numpy import get_printoptions  # NOQA
+from numpy import set_printoptions  # NOQA
+from numpy import printoptions  # NOQA
+from numpy import set_string_function  # NOQA
+
+
+# -----------------------------------------------------------------------------
+# Linear algebra
+# -----------------------------------------------------------------------------
+from cupy.linalg._einsum import einsum  # NOQA
+
+from cupy.linalg._product import cross  # NOQA
+from cupy.linalg._product import dot  # NOQA
+from cupy.linalg._product import inner  # NOQA
+from cupy.linalg._product import kron  # NOQA
+from cupy.linalg._product import matmul  # NOQA
+from cupy.linalg._product import outer  # NOQA
+from cupy.linalg._product import tensordot  # NOQA
+from cupy.linalg._product import vdot  # NOQA
+
+from cupy.linalg._norms import trace  # NOQA
+
+# -----------------------------------------------------------------------------
+# Logic functions
+# -----------------------------------------------------------------------------
+from cupy._logic.comparison import allclose  # NOQA
+from cupy._logic.comparison import array_equal  # NOQA
+from cupy._logic.comparison import array_equiv  # NOQA
+from cupy._logic.comparison import isclose  # NOQA
+
+from cupy._logic.content import isfinite  # NOQA
+from cupy._logic.content import isinf  # NOQA
+from cupy._logic.content import isnan  # NOQA
+from cupy._logic.content import isneginf  # NOQA
+from cupy._logic.content import isposinf  # NOQA
+
+from cupy._logic.type_testing import iscomplex  # NOQA
+from cupy._logic.type_testing import iscomplexobj  # NOQA
+from cupy._logic.type_testing import isfortran  # NOQA
+from cupy._logic.type_testing import isreal  # NOQA
+from cupy._logic.type_testing import isrealobj  # NOQA
+
+from cupy._logic.truth import in1d  # NOQA
+from cupy._logic.truth import intersect1d  # NOQA
+from cupy._logic.truth import isin  # NOQA
+from cupy._logic.truth import setdiff1d  # NOQA
+from cupy._logic.truth import setxor1d  # NOQA
+from cupy._logic.truth import union1d  # NOQA
+
+
+def isscalar(element):
+    """Returns True if the type of ``element`` is a scalar type.
+
+    ..
seealso:: :func:`numpy.isscalar` + """ + return _numpy.isscalar(element) + + +from cupy._logic.ops import logical_and # NOQA +from cupy._logic.ops import logical_not # NOQA +from cupy._logic.ops import logical_or # NOQA +from cupy._logic.ops import logical_xor # NOQA + +from cupy._logic.comparison import equal # NOQA +from cupy._logic.comparison import greater # NOQA +from cupy._logic.comparison import greater_equal # NOQA +from cupy._logic.comparison import less # NOQA +from cupy._logic.comparison import less_equal # NOQA +from cupy._logic.comparison import not_equal # NOQA + +from cupy._logic.truth import all # NOQA +from cupy._logic.truth import all as alltrue # NOQA +from cupy._logic.truth import any # NOQA +from cupy._logic.truth import any as sometrue # NOQA + +# ------------------------------------------------------------------------------ +# Polynomial functions +# ------------------------------------------------------------------------------ +from cupy.lib._polynomial import poly1d # NOQA +from cupy.lib._routines_poly import poly # NOQA +from cupy.lib._routines_poly import polyadd # NOQA +from cupy.lib._routines_poly import polysub # NOQA +from cupy.lib._routines_poly import polymul # NOQA +from cupy.lib._routines_poly import polyfit # NOQA +from cupy.lib._routines_poly import polyval # NOQA +from cupy.lib._routines_poly import roots # NOQA + +# Borrowed from NumPy +from numpy import RankWarning # NOQA + +# ----------------------------------------------------------------------------- +# Mathematical functions +# ----------------------------------------------------------------------------- +from cupy._math.trigonometric import arccos # NOQA +from cupy._math.trigonometric import arcsin # NOQA +from cupy._math.trigonometric import arctan # NOQA +from cupy._math.trigonometric import arctan2 # NOQA +from cupy._math.trigonometric import cos # NOQA +from cupy._math.trigonometric import deg2rad # NOQA +from cupy._math.trigonometric import degrees # NOQA +from cupy._math.trigonometric import hypot # NOQA +from cupy._math.trigonometric import rad2deg # NOQA +from cupy._math.trigonometric import radians # NOQA +from cupy._math.trigonometric import sin # NOQA +from cupy._math.trigonometric import tan # NOQA +from cupy._math.trigonometric import unwrap # NOQA + +from cupy._math.hyperbolic import arccosh # NOQA +from cupy._math.hyperbolic import arcsinh # NOQA +from cupy._math.hyperbolic import arctanh # NOQA +from cupy._math.hyperbolic import cosh # NOQA +from cupy._math.hyperbolic import sinh # NOQA +from cupy._math.hyperbolic import tanh # NOQA + +from cupy._math.rounding import around # NOQA +from cupy._math.rounding import ceil # NOQA +from cupy._math.rounding import fix # NOQA +from cupy._math.rounding import floor # NOQA +from cupy._math.rounding import rint # NOQA +from cupy._math.rounding import round_ # NOQA +from cupy._math.rounding import round_ as round # NOQA +from cupy._math.rounding import trunc # NOQA + +from cupy._math.sumprod import prod # NOQA +from cupy._math.sumprod import prod as product # NOQA +from cupy._math.sumprod import sum # NOQA +from cupy._math.sumprod import cumprod # NOQA +from cupy._math.sumprod import cumprod as cumproduct # NOQA +from cupy._math.sumprod import cumsum # NOQA +from cupy._math.sumprod import ediff1d # NOQA +from cupy._math.sumprod import nancumprod # NOQA +from cupy._math.sumprod import nancumsum # NOQA +from cupy._math.sumprod import nansum # NOQA +from cupy._math.sumprod import nanprod # NOQA +from cupy._math.sumprod import diff # NOQA +from 
cupy._math.sumprod import gradient # NOQA +from cupy._math.sumprod import trapz # NOQA +from cupy._math.window import bartlett # NOQA +from cupy._math.window import blackman # NOQA +from cupy._math.window import hamming # NOQA +from cupy._math.window import hanning # NOQA +from cupy._math.window import kaiser # NOQA + +from cupy._math.explog import exp # NOQA +from cupy._math.explog import exp2 # NOQA +from cupy._math.explog import expm1 # NOQA +from cupy._math.explog import log # NOQA +from cupy._math.explog import log10 # NOQA +from cupy._math.explog import log1p # NOQA +from cupy._math.explog import log2 # NOQA +from cupy._math.explog import logaddexp # NOQA +from cupy._math.explog import logaddexp2 # NOQA + +from cupy._math.special import i0 # NOQA +from cupy._math.special import sinc # NOQA + +from cupy._math.floating import copysign # NOQA +from cupy._math.floating import frexp # NOQA +from cupy._math.floating import ldexp # NOQA +from cupy._math.floating import nextafter # NOQA +from cupy._math.floating import signbit # NOQA + +from cupy._math.rational import gcd # NOQA +from cupy._math.rational import lcm # NOQA + +from cupy._math.arithmetic import add # NOQA +from cupy._math.arithmetic import divide # NOQA +from cupy._math.arithmetic import divmod # NOQA +from cupy._math.arithmetic import floor_divide # NOQA +from cupy._math.arithmetic import float_power # NOQA +from cupy._math.arithmetic import fmod # NOQA +from cupy._math.arithmetic import modf # NOQA +from cupy._math.arithmetic import multiply # NOQA +from cupy._math.arithmetic import negative # NOQA +from cupy._math.arithmetic import positive # NOQA +from cupy._math.arithmetic import power # NOQA +from cupy._math.arithmetic import reciprocal # NOQA +from cupy._math.arithmetic import remainder # NOQA +from cupy._math.arithmetic import remainder as mod # NOQA +from cupy._math.arithmetic import subtract # NOQA +from cupy._math.arithmetic import true_divide # NOQA + +from cupy._math.arithmetic import angle # NOQA +from cupy._math.arithmetic import conjugate as conj # NOQA +from cupy._math.arithmetic import conjugate # NOQA +from cupy._math.arithmetic import imag # NOQA +from cupy._math.arithmetic import real # NOQA + +from cupy._math.misc import absolute as abs # NOQA +from cupy._math.misc import absolute # NOQA +from cupy._math.misc import cbrt # NOQA +from cupy._math.misc import clip # NOQA +from cupy._math.misc import fabs # NOQA +from cupy._math.misc import fmax # NOQA +from cupy._math.misc import fmin # NOQA +from cupy._math.misc import interp # NOQA +from cupy._math.misc import maximum # NOQA +from cupy._math.misc import minimum # NOQA +from cupy._math.misc import nan_to_num # NOQA +from cupy._math.misc import real_if_close # NOQA +from cupy._math.misc import sign # NOQA +from cupy._math.misc import heaviside # NOQA +from cupy._math.misc import sqrt # NOQA +from cupy._math.misc import square # NOQA +from cupy._math.misc import convolve # NOQA + +# ----------------------------------------------------------------------------- +# Miscellaneous routines +# ----------------------------------------------------------------------------- +from cupy._misc.byte_bounds import byte_bounds # NOQA +from cupy._misc.memory_ranges import may_share_memory # NOQA +from cupy._misc.memory_ranges import shares_memory # NOQA +from cupy._misc.who import who # NOQA + +# Borrowed from NumPy +from numpy import disp # NOQA +from numpy import iterable # NOQA +from numpy import safe_eval # NOQA +from numpy import AxisError # NOQA + + +# 
----------------------------------------------------------------------------- +# Padding +# ----------------------------------------------------------------------------- +from cupy._padding.pad import pad # NOQA + + +# ----------------------------------------------------------------------------- +# Sorting, searching, and counting +# ----------------------------------------------------------------------------- +from cupy._sorting.count import count_nonzero # NOQA + +from cupy._sorting.search import argmax # NOQA +from cupy._sorting.search import argmin # NOQA +from cupy._sorting.search import argwhere # NOQA +from cupy._sorting.search import flatnonzero # NOQA +from cupy._sorting.search import nanargmax # NOQA +from cupy._sorting.search import nanargmin # NOQA +from cupy._sorting.search import nonzero # NOQA +from cupy._sorting.search import searchsorted # NOQA +from cupy._sorting.search import where # NOQA + +from cupy._sorting.sort import argpartition # NOQA +from cupy._sorting.sort import argsort # NOQA +from cupy._sorting.sort import lexsort # NOQA +from cupy._sorting.sort import msort # NOQA +from cupy._sorting.sort import sort_complex # NOQA +from cupy._sorting.sort import partition # NOQA +from cupy._sorting.sort import sort # NOQA + +# ----------------------------------------------------------------------------- +# Statistics +# ----------------------------------------------------------------------------- +from cupy._statistics.correlation import corrcoef # NOQA +from cupy._statistics.correlation import cov # NOQA +from cupy._statistics.correlation import correlate # NOQA + +from cupy._statistics.order import amax # NOQA +from cupy._statistics.order import amax as max # NOQA +from cupy._statistics.order import amin # NOQA +from cupy._statistics.order import amin as min # NOQA +from cupy._statistics.order import nanmax # NOQA +from cupy._statistics.order import nanmin # NOQA +from cupy._statistics.order import percentile # NOQA +from cupy._statistics.order import ptp # NOQA +from cupy._statistics.order import quantile # NOQA + +from cupy._statistics.meanvar import median # NOQA +from cupy._statistics.meanvar import average # NOQA +from cupy._statistics.meanvar import mean # NOQA +from cupy._statistics.meanvar import std # NOQA +from cupy._statistics.meanvar import var # NOQA +from cupy._statistics.meanvar import nanmedian # NOQA +from cupy._statistics.meanvar import nanmean # NOQA +from cupy._statistics.meanvar import nanstd # NOQA +from cupy._statistics.meanvar import nanvar # NOQA + +from cupy._statistics.histogram import bincount # NOQA +from cupy._statistics.histogram import digitize # NOQA +from cupy._statistics.histogram import histogram # NOQA +from cupy._statistics.histogram import histogram2d # NOQA +from cupy._statistics.histogram import histogramdd # NOQA + +# ----------------------------------------------------------------------------- +# Classes without their own docs +# ----------------------------------------------------------------------------- +from numpy import ComplexWarning # NOQA +from numpy import ModuleDeprecationWarning # NOQA +from numpy import TooHardError # NOQA +from numpy import VisibleDeprecationWarning # NOQA + + +# ----------------------------------------------------------------------------- +# Undocumented functions +# ----------------------------------------------------------------------------- +from cupy._core import size # NOQA + + +def ndim(a): + """Returns the number of dimensions of an array. 
+
+    Args:
+        a (array-like): If it is not already a :class:`cupy.ndarray`, a
+            conversion via :func:`numpy.asarray` is attempted.
+
+    Returns:
+        (int): The number of dimensions in `a`.
+
+    """
+    try:
+        return a.ndim
+    except AttributeError:
+        return _numpy.ndim(a)
+
+
+# -----------------------------------------------------------------------------
+# CuPy specific functions
+# -----------------------------------------------------------------------------
+
+from cupy._util import clear_memo  # NOQA
+from cupy._util import memoize  # NOQA
+
+from cupy._core import ElementwiseKernel  # NOQA
+from cupy._core import RawKernel  # NOQA
+from cupy._core import RawModule  # NOQA
+from cupy._core._reduction import ReductionKernel  # NOQA
+
+# -----------------------------------------------------------------------------
+# DLPack
+# -----------------------------------------------------------------------------
+
+from cupy._core import fromDlpack  # NOQA
+from cupy._core import from_dlpack  # NOQA
+
+
+def asnumpy(a, stream=None, order='C', out=None):
+    """Returns an array on the host memory from an arbitrary source array.
+
+    Args:
+        a: Arbitrary object that can be converted to :class:`numpy.ndarray`.
+        stream (cupy.cuda.Stream): CUDA stream object. If it is specified,
+            the device-to-host copy runs asynchronously; otherwise, the copy
+            is synchronous. Note that if ``a`` is not a :class:`cupy.ndarray`
+            object, this argument has no effect.
+        order ({'C', 'F', 'A'}): The desired memory layout of the host
+            array. When ``order`` is 'A', it uses 'F' if ``a`` is
+            Fortran-contiguous and 'C' otherwise.
+        out (numpy.ndarray): The output array to be written to. Its shape
+            and dtype must be compatible with those of ``a``.
+
+    Returns:
+        numpy.ndarray: Converted array on the host memory.
+
+    """
+    if isinstance(a, ndarray):
+        return a.get(stream=stream, order=order, out=out)
+    elif hasattr(a, "__cuda_array_interface__"):
+        return array(a).get(stream=stream, order=order, out=out)
+    else:
+        temp = _numpy.asarray(a, order=order)
+        if out is not None:
+            out[...] = temp
+        else:
+            out = temp
+        return out
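A minimal round-trip sketch for ``asnumpy`` (illustrative only; assumes a CUDA device is present):

>>> x_gpu = cupy.arange(6).reshape(2, 3)
>>> x_cpu = cupy.asnumpy(x_gpu)  # synchronous device-to-host copy
>>> type(x_cpu)
<class 'numpy.ndarray'>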
+
+
+_cupy = _sys.modules[__name__]
+
+
+def get_array_module(*args):
+    """Returns the array module for arguments.
+
+    This function is used to implement CPU/GPU generic code. If at least one
+    of the arguments is a :class:`cupy.ndarray` object, the :mod:`cupy`
+    module is returned.
+
+    Args:
+        args: Values to determine whether NumPy or CuPy should be used.
+
+    Returns:
+        module: :mod:`cupy` or :mod:`numpy` is returned based on the types of
+        the arguments.
+
+    .. admonition:: Example
+
+       A NumPy/CuPy generic function can be written as follows
+
+       >>> def softplus(x):
+       ...     xp = cupy.get_array_module(x)
+       ...     return xp.maximum(0, x) + xp.log1p(xp.exp(-abs(x)))
+
+    """
+    for arg in args:
+        if isinstance(arg, (ndarray, _cupyx.scipy.sparse.spmatrix,
+                            _core.fusion._FusionVarArray,
+                            _core.new_fusion._ArrayProxy)):
+            return _cupy
+    return _numpy
+
+
+fuse = _core.fusion.fuse
+
+disable_experimental_feature_warning = False
+
+
+# set default allocator
+_default_memory_pool = cuda.MemoryPool()
+_default_pinned_memory_pool = cuda.PinnedMemoryPool()
+
+cuda.set_allocator(_default_memory_pool.malloc)
+cuda.set_pinned_memory_allocator(_default_pinned_memory_pool.malloc)
+
+
+def get_default_memory_pool():
+    """Returns the CuPy default memory pool for GPU memory.
+
+    Returns:
+        cupy.cuda.MemoryPool: The memory pool object.
+
+    .. note::
+        If you want to disable the memory pool, please use the following
+        code.
+
+        >>> cupy.cuda.set_allocator(None)
+
+    """
+    return _default_memory_pool
+
+
+def get_default_pinned_memory_pool():
+    """Returns the CuPy default memory pool for pinned memory.
+
+    Returns:
+        cupy.cuda.PinnedMemoryPool: The memory pool object.
+
+    .. note::
+        If you want to disable the memory pool, please use the following
+        code.
+
+        >>> cupy.cuda.set_pinned_memory_allocator(None)
+
+    """
+    return _default_pinned_memory_pool
+
+
+def show_config(*, _full=False):
+    """Prints the current runtime configuration to standard output."""
+    _sys.stdout.write(str(_cupyx.get_runtime_info(full=_full)))
+    _sys.stdout.flush()
+
+
+_deprecated_apis = [
+    'int0',
+    'uint0',
+    'bool8',
+]
+
+
+def __getattr__(name):
+    if name in _deprecated_apis:
+        return getattr(_numpy, name)
+
+    raise AttributeError(f"module 'cupy' has no attribute {name!r}")
diff --git a/cupy/_binary/__init__.py b/cupy/_binary/__init__.py
new file mode 100644
index 0000000..3feab0f
--- /dev/null
+++ b/cupy/_binary/__init__.py
@@ -0,0 +1,2 @@
+# Functions from the following NumPy document
+# https://numpy.org/doc/stable/reference/routines.bitwise.html
diff --git a/cupy/_binary/elementwise.py b/cupy/_binary/elementwise.py
new file mode 100644
index 0000000..11e4fa1
--- /dev/null
+++ b/cupy/_binary/elementwise.py
@@ -0,0 +1,22 @@
+from cupy import _core
+
+
+bitwise_and = _core.bitwise_and
+
+
+bitwise_or = _core.bitwise_or
+
+
+bitwise_xor = _core.bitwise_xor
+
+
+bitwise_not = _core.invert
+
+
+invert = _core.invert
+
+
+left_shift = _core.left_shift
+
+
+right_shift = _core.right_shift
diff --git a/cupy/_binary/packing.py b/cupy/_binary/packing.py
new file mode 100644
index 0000000..8d6c5a0
--- /dev/null
+++ b/cupy/_binary/packing.py
@@ -0,0 +1,104 @@
+import cupy
+from cupy import _core
+
+
+_packbits_kernel = {
+    'big': _core.ElementwiseKernel(
+        'raw T a, raw int32 a_size', 'uint8 packed',
+        '''for (int j = 0; j < 8; ++j) {
+            int k = i * 8 + j;
+            int bit = k < a_size && a[k] != 0;
+            packed |= bit << (7 - j);
+        }''',
+        'cupy_packbits_big'
+    ),
+    'little': _core.ElementwiseKernel(
+        'raw T a, raw int32 a_size', 'uint8 packed',
+        '''for (int j = 0; j < 8; ++j) {
+            int k = i * 8 + j;
+            int bit = k < a_size && a[k] != 0;
+            packed |= bit << j;
+        }''',
+        'cupy_packbits_little'
+    )
+}
+
+
+def packbits(a, axis=None, bitorder='big'):
+    """Packs the elements of a binary-valued array into bits in a uint8 array.
+
+    This function currently does not support the ``axis`` option.
+
+    Args:
+        a (cupy.ndarray): Input array.
+        axis (int, optional): Not supported yet.
+        bitorder (str, optional): Bit order to use when packing the array,
+            allowed values are `'little'` and `'big'`. Defaults to `'big'`.
+
+    Returns:
+        cupy.ndarray: The packed array.
+
+    .. note::
+        When the input array is empty, this function returns a copy of it,
+        i.e., the type of the output array is not necessarily always uint8.
+        This exactly follows NumPy's behavior (as of version 1.11), although
+        it is inconsistent with the documentation.
+
+    .. seealso:: :func:`numpy.packbits`
+    """
+    if a.dtype.kind not in 'biu':
+        raise TypeError(
+            'Expected an input array of integer or boolean data type')
+
+    if axis is not None:
+        raise NotImplementedError('axis option is not supported yet')
+
+    if bitorder not in ('big', 'little'):
+        raise ValueError("bitorder must be either 'big' or 'little'")
+
+    a = a.ravel()
+    packed_size = (a.size + 7) // 8
+    packed = cupy.zeros((packed_size,), dtype=cupy.uint8)
+    return _packbits_kernel[bitorder](a, a.size, packed)
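A minimal sketch of the round trip implemented by the packing kernels above (illustrative; assumes a CUDA device). With the default big-endian bit order, [1, 0, 1, 1] is zero-padded to 10110000, i.e. 176:

>>> a = cupy.array([1, 0, 1, 1], dtype=cupy.uint8)
>>> cupy.packbits(a)
array([176], dtype=uint8)
>>> cupy.unpackbits(cupy.packbits(a))[:4]
array([1, 0, 1, 1], dtype=uint8)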
+
+
+_unpackbits_kernel = {
+    'big': _core.ElementwiseKernel(
+        'raw uint8 a', 'T unpacked',
+        'unpacked = (a[i / 8] >> (7 - i % 8)) & 1;',
+        'cupy_unpackbits_big'
+    ),
+    'little': _core.ElementwiseKernel(
+        'raw uint8 a', 'T unpacked',
+        'unpacked = (a[i / 8] >> (i % 8)) & 1;',
+        'cupy_unpackbits_little'
+    )
+}
+
+
+def unpackbits(a, axis=None, bitorder='big'):
+    """Unpacks elements of a uint8 array into a binary-valued output array.
+
+    This function currently does not support the ``axis`` option.
+
+    Args:
+        a (cupy.ndarray): Input array.
+        axis (int, optional): Not supported yet.
+        bitorder (str, optional): Bit order to use when unpacking the array,
+            allowed values are `'little'` and `'big'`. Defaults to `'big'`.
+
+    Returns:
+        cupy.ndarray: The unpacked array.
+
+    .. seealso:: :func:`numpy.unpackbits`
+    """
+    if a.dtype != cupy.uint8:
+        raise TypeError('Expected an input array of unsigned byte data type')
+
+    if axis is not None:
+        raise NotImplementedError('axis option is not supported yet')
+
+    if bitorder not in ('big', 'little'):
+        raise ValueError("bitorder must be either 'big' or 'little'")
+
+    unpacked = cupy.ndarray((a.size * 8), dtype=cupy.uint8)
+    return _unpackbits_kernel[bitorder](a, unpacked)
diff --git a/cupy/_core/__init__.pxd b/cupy/_core/__init__.pxd
new file mode 100644
index 0000000..e69de29
diff --git a/cupy/_core/__init__.py b/cupy/_core/__init__.py
new file mode 100644
index 0000000..5531b27
--- /dev/null
+++ b/cupy/_core/__init__.py
@@ -0,0 +1,79 @@
+# mypy: ignore-errors
+
+from cupy._core import core  # NOQA
+from cupy._core import fusion  # NOQA
+from cupy._core import internal  # NOQA
+
+
+# internal APIs for testing and development
+from cupy._core._accelerator import set_elementwise_accelerators  # NOQA
+from cupy._core._accelerator import set_reduction_accelerators  # NOQA
+from cupy._core._accelerator import set_routine_accelerators  # NOQA
+from cupy._core._accelerator import get_elementwise_accelerators  # NOQA
+from cupy._core._accelerator import get_reduction_accelerators  # NOQA
+from cupy._core._accelerator import get_routine_accelerators  # NOQA
+
+
+# import class and function
+from cupy._core._kernel import create_ufunc  # NOQA
+from cupy._core._kernel import ElementwiseKernel  # NOQA
+from cupy._core._kernel import ufunc  # NOQA
+from cupy._core._kernel import _get_warpsize  # NOQA
+from cupy._core._reduction import create_reduction_func  # NOQA
+from cupy._core._reduction import ReductionKernel  # NOQA
+from cupy._core._routines_binary import bitwise_and  # NOQA
+from cupy._core._routines_binary import bitwise_or  # NOQA
+from cupy._core._routines_binary import bitwise_xor  # NOQA
+from cupy._core._routines_binary import invert  # NOQA
+from cupy._core._routines_binary import left_shift  # NOQA
+from cupy._core._routines_binary import right_shift  # NOQA
+from cupy._core._routines_linalg import _mat_ptrs  # NOQA
+from cupy._core._routines_linalg import dot  # NOQA
+from cupy._core._routines_linalg import get_compute_type  # NOQA
+from cupy._core._routines_linalg import matmul  # NOQA
+from cupy._core._routines_linalg import set_compute_type  # NOQA
+from cupy._core._routines_linalg import tensordot_core  # NOQA
+from cupy._core._routines_logic import create_comparison  # NOQA
+from cupy._core._routines_logic import equal  # NOQA
+from cupy._core._routines_logic import greater  # NOQA
+from cupy._core._routines_logic import greater_equal  # NOQA
+from cupy._core._routines_logic import less  # NOQA
+from cupy._core._routines_logic import less_equal  # NOQA
+from cupy._core._routines_logic import not_equal  # NOQA
+from cupy._core._routines_manipulation import array_split  # NOQA
+from cupy._core._routines_manipulation import broadcast  # NOQA
+from cupy._core._routines_manipulation import broadcast_to  # NOQA
+from cupy._core._routines_manipulation import concatenate_method  # NOQA
+from cupy._core._routines_manipulation import moveaxis  # NOQA
+from cupy._core._routines_manipulation import rollaxis  # NOQA
+from cupy._core._routines_manipulation import size  # NOQA
+from cupy._core._routines_math import absolute  # NOQA
+from cupy._core._routines_math import add  # NOQA
+from cupy._core._routines_math import angle, angle_deg  # NOQA
+from cupy._core._routines_math import conjugate  # NOQA
+from cupy._core._routines_math import divide  # NOQA
+from cupy._core._routines_math import floor_divide  # NOQA
+from cupy._core._routines_math import multiply  # NOQA
+from cupy._core._routines_math import negative  # NOQA
+from cupy._core._routines_math import positive  # NOQA
+from cupy._core._routines_math import power  # NOQA
+from cupy._core._routines_math import remainder  # NOQA
+from cupy._core._routines_math import sqrt  # NOQA
+from cupy._core._routines_math import subtract  # NOQA
+from cupy._core._routines_math import true_divide  # NOQA
+from cupy._core._routines_statistics import nanmax  # NOQA
+from cupy._core._routines_statistics import nanmin  # NOQA
+from cupy._core.core import _internal_ascontiguousarray  # NOQA
+from cupy._core.core import _internal_asfortranarray  # NOQA
+from cupy._core.core import array  # NOQA
+from cupy._core.core import ascontiguousarray  # NOQA
+from cupy._core.core import asfortranarray  # NOQA
+from cupy._core.core import divmod  # NOQA
+from cupy._core.core import elementwise_copy  # NOQA
+from cupy._core.core import ndarray  # NOQA
+from cupy._core.dlpack import fromDlpack  # NOQA
+from cupy._core.dlpack import from_dlpack  # NOQA
+from cupy._core.internal import complete_slice  # NOQA
+from cupy._core.internal import get_size  # NOQA
+from cupy._core.raw import RawKernel  # NOQA
+from cupy._core.raw import RawModule  # NOQA
diff --git a/cupy/_core/_accelerator.pxd b/cupy/_core/_accelerator.pxd
new file mode 100644
index 0000000..773a5ec
--- /dev/null
+++ b/cupy/_core/_accelerator.pxd
@@ -0,0 +1,10 @@
+cdef list _elementwise_accelerators
+
+cdef list _reduction_accelerators
+
+cdef list _routine_accelerators
+
+cpdef enum accelerator_type:
+    ACCELERATOR_CUB = 1
+    ACCELERATOR_CUTENSOR = 2
+    ACCELERATOR_CUTENSORNET = 3
diff --git a/cupy/_core/_accelerator.pyx b/cupy/_core/_accelerator.pyx
new file mode 100644
index 0000000..6ce2991
--- /dev/null
+++ b/cupy/_core/_accelerator.pyx
@@ -0,0 +1,59 @@
+import os
+
+from cupy_backends.cuda.api cimport runtime
+
+
+cdef list _elementwise_accelerators = []
+cdef list _reduction_accelerators = []
+cdef list _routine_accelerators = []
+
+
+cdef int _get_accelerator(accelerator) except -1:
+    if isinstance(accelerator, int):
+        return accelerator
+    if accelerator == 'cub':
+        return ACCELERATOR_CUB
+    if accelerator == 'cutensor':
+        return
ACCELERATOR_CUTENSOR + if accelerator == 'cutensornet': + return ACCELERATOR_CUTENSORNET + raise ValueError('Unknown accelerator: {}'.format(accelerator)) + + +def set_elementwise_accelerators(accelerators): + global _elementwise_accelerators + _elementwise_accelerators = [_get_accelerator(b) for b in accelerators] + + +def set_reduction_accelerators(accelerators): + global _reduction_accelerators + _reduction_accelerators = [_get_accelerator(b) for b in accelerators] + + +def set_routine_accelerators(accelerators): + global _routine_accelerators + _routine_accelerators = [_get_accelerator(b) for b in accelerators] + + +def get_elementwise_accelerators(): + return _elementwise_accelerators + + +def get_reduction_accelerators(): + return _reduction_accelerators + + +def get_routine_accelerators(): + return _routine_accelerators + + +cdef _set_default_accelerators(): + cdef str b, accelerator_names = os.getenv( + 'CUPY_ACCELERATORS', '' if runtime._is_hip_environment else 'cub') + cdef list accelerators = [b for b in accelerator_names.split(',') if b] + set_elementwise_accelerators(accelerators) + set_reduction_accelerators(accelerators) + set_routine_accelerators(accelerators) + + +_set_default_accelerators() diff --git a/cupy/_core/_carray.pxd b/cupy/_core/_carray.pxd new file mode 100644 index 0000000..c443917 --- /dev/null +++ b/cupy/_core/_carray.pxd @@ -0,0 +1,55 @@ +cimport cython # NOQA +from libcpp cimport vector + +from cupy.cuda cimport function + + +ctypedef vector.vector[Py_ssize_t] shape_t +ctypedef vector.vector[Py_ssize_t] strides_t + +# this matches NPY_MAXDIMS +# Note: we make it an enum to work around cython/cython#4369 +cdef enum: MAX_NDIM = 32 + + +cdef struct _CArray: + void* data + Py_ssize_t size + Py_ssize_t shape_and_strides[MAX_NDIM * 2] + + +@cython.final +cdef class CArray(function.CPointer): + + cdef: + _CArray val + + cdef void init( + self, void* data_ptr, Py_ssize_t data_size, + const shape_t& shape, const strides_t& strides) except* + + +cdef struct _CIndexer: + Py_ssize_t size + Py_ssize_t shape_and_index[MAX_NDIM * 2] + + +cdef class CIndexer(function.CPointer): + cdef: + _CIndexer val + + cdef void init(self, Py_ssize_t size, const shape_t &shape) except* + + +cdef class Indexer: + cdef: + readonly Py_ssize_t size + readonly shape_t shape + readonly bint _index_32_bits + + cdef void init(self, const shape_t& shape) + + cdef function.CPointer get_pointer(self) + + +cdef Indexer _indexer_init(const shape_t& shape) diff --git a/cupy/_core/_carray.pyx b/cupy/_core/_carray.pyx new file mode 100644 index 0000000..e9b7841 --- /dev/null +++ b/cupy/_core/_carray.pyx @@ -0,0 +1,57 @@ +from cupy.cuda cimport function +from cupy._core cimport internal + + +cdef class CArray(function.CPointer): + + cdef void init( + self, void* data_ptr, Py_ssize_t data_size, + const shape_t& shape, const strides_t& strides) except*: + cdef size_t ndim = shape.size() + assert ndim == strides.size() + assert ndim <= MAX_NDIM + cdef Py_ssize_t* shape_and_strides = ( + self.val.shape_and_strides) + cdef size_t i + + self.val.data = data_ptr + self.val.size = data_size + for i in range(ndim): + shape_and_strides[i] = shape[i] + shape_and_strides[i + ndim] = strides[i] + self.ptr = &self.val + + +cdef class CIndexer(function.CPointer): + + cdef void init(self, Py_ssize_t size, const shape_t &shape) except*: + cdef size_t ndim = shape.size() + assert ndim <= MAX_NDIM + self.val.size = size + cdef Py_ssize_t i + for i in range(shape.size()): + self.val.shape_and_index[i] = shape[i] + 
self.ptr = &self.val
+
+
+cdef class Indexer:
+
+    cdef void init(self, const shape_t& shape):
+        self.shape = shape
+        self.size = internal.prod(shape)
+        self._index_32_bits = self.size <= (1 << 31)
+
+    @property
+    def ndim(self):
+        return self.shape.size()
+
+    cdef function.CPointer get_pointer(self):
+        cdef CIndexer indexer = CIndexer.__new__(CIndexer)
+        indexer.init(self.size, self.shape)
+        return indexer
+
+
+cdef inline Indexer _indexer_init(const shape_t& shape):
+    cdef Indexer indexer = Indexer.__new__(Indexer)
+    indexer.init(shape)
+    return indexer
diff --git a/cupy/_core/_codeblock.py b/cupy/_core/_codeblock.py
new file mode 100644
index 0000000..b5ac4bc
--- /dev/null
+++ b/cupy/_core/_codeblock.py
@@ -0,0 +1,38 @@
+from typing import Any, List
+
+_CodeType = Any  # TODO(asi1024): Correct type annotation
+
+
+class CodeBlock:
+    """A code fragment formatted for readability.
+    """
+
+    def __init__(self, head: str, codes: _CodeType) -> None:
+        self._head = '' if head == '' else head + ' '
+        self._codes = codes
+
+    def _to_str_list(self, indent_width: int = 0) -> List[str]:
+        codes: List[str] = []
+        codes.append(' ' * indent_width + self._head + '{')
+        for code in self._codes:
+            next_indent_width = indent_width + 2
+            if isinstance(code, str):
+                codes.append(' ' * next_indent_width + code)
+            elif isinstance(code, CodeBlock):
+                codes += code._to_str_list(indent_width=next_indent_width)
+            else:
+                assert False
+        codes.append(' ' * indent_width + '}')
+        return codes
+
+    def __str__(self) -> str:
+        """Emit a CUDA program in the following format.
+
+        <<head>> {
+          <<code>>
+          ...;
+          <<code>>
+        }
+        """
+
+        return '\n'.join(self._to_str_list())
diff --git a/cupy/_core/_cub_reduction.pxd b/cupy/_core/_cub_reduction.pxd
new file mode 100644
index 0000000..2eae617
--- /dev/null
+++ b/cupy/_core/_cub_reduction.pxd
@@ -0,0 +1,12 @@
+from cupy._core._carray cimport shape_t
+from cupy._core._kernel cimport _TypeMap
+from cupy._core.core cimport _ndarray_base
+
+
+cdef bint _try_to_call_cub_reduction(
+        self, list in_args, list out_args, const shape_t& a_shape,
+        stream, optimize_context, tuple key,
+        map_expr, reduce_expr, post_map_expr,
+        reduce_type, _TypeMap type_map,
+        tuple reduce_axis, tuple out_axis, const shape_t& out_shape,
+        _ndarray_base ret) except *
diff --git a/cupy/_core/_cub_reduction.pyx b/cupy/_core/_cub_reduction.pyx
new file mode 100644
index 0000000..b5504ed
--- /dev/null
+++ b/cupy/_core/_cub_reduction.pyx
@@ -0,0 +1,712 @@
+from cupy._core._carray cimport shape_t
+from cupy._core cimport _kernel
+from cupy._core cimport _optimize_config
+from cupy._core cimport _reduction
+from cupy._core cimport _scalar
+from cupy._core.core cimport compile_with_cache
+from cupy._core.core cimport _ndarray_base
+from cupy._core.core cimport _internal_ascontiguousarray
+from cupy._core cimport internal
+from cupy.cuda cimport cub
+from cupy.cuda cimport function
+from cupy.cuda cimport memory
+from cupy_backends.cuda.api cimport runtime
+
+import math
+import string
+import sys
+from cupy import _environment
+from cupy._core._kernel import _get_param_info
+from cupy.cuda import driver
+from cupy import _util
+
+
+cdef function.Function _create_cub_reduction_function(
+        name, block_size, items_per_thread,
+        reduce_type, params, arginfos, identity,
+        pre_map_expr, reduce_expr, post_map_expr,
+        _kernel._TypeMap type_map, preamble, options):
+    # An (incomplete) list of internal variables:
+    # _J: the index of an element in the array
+
+    # ROCm 5.3 and above require C++14
+    if runtime._is_hip_environment:
+        options +=
('--std=c++14',) + else: + # static_assert needs at least C++11 in NVRTC + options += ('--std=c++11',) + + cdef str backend + if runtime._is_hip_environment: + # In ROCm, we need to set the include path. This does not work for + # hiprtc as of ROCm 3.5.0, so we must use hipcc. + options += ('-I' + _rocm_path + '/include', '-O2') + backend = 'nvcc' # this is confusing... + elif sys.platform.startswith('win32'): + # See #4771. NVRTC on Windows seems to have problems in handling empty + # macros, so any usage like this: + # #ifndef CUB_NS_PREFIX + # #define CUB_NS_PREFIX + # #endif + # will drive NVRTC nuts (error: this declaration has no storage class + # or type specifier). However, we cannot find a minimum reproducer to + # confirm this is the root cause, so we work around by using nvcc. + backend = 'nvcc' + else: + # use jitify + nvrtc + # TODO(leofang): how about simply specifying jitify=True when calling + # compile_with_cache()? + options += ('-DCUPY_USE_JITIFY',) + backend = 'nvrtc' + + # TODO(leofang): try splitting the for-loop into full tiles and partial + # tiles to utilize LoadDirectBlockedVectorized? See, for example, + # https://github.com/NVlabs/cub/blob/c3cceac115c072fb63df1836ff46d8c60d9eb304/cub/agent/agent_reduce.cuh#L311-L346 + + cdef str module_code = _get_cub_header_include() + module_code += ''' +${type_preamble} +${preamble} + +typedef ${reduce_type} _type_reduce; + +static_assert(sizeof(_type_reduce) <= 32, + "The intermediate reduction type is assumed to be at most 32 bytes."); + +// Compile-time constants for CUB template specializations +#define ITEMS_PER_THREAD ${items_per_thread} +#define BLOCK_SIZE ${block_size} + +// for hipCUB: use the hipcub namespace +#ifdef __HIP_DEVICE_COMPILE__ +#define cub hipcub +#endif + +#if defined FIRST_PASS + typedef type_in0_raw type_mid_in; + typedef _type_reduce type_mid_out; + #define POST_MAP(a) out0 = a; +#elif defined SECOND_PASS + typedef _type_reduce type_mid_in; + typedef type_out0_raw type_mid_out; + #define POST_MAP(a) (${post_map_expr}) +#else // one-pass reduction + typedef type_in0_raw type_mid_in; + typedef type_out0_raw type_mid_out; + #define POST_MAP(a) (${post_map_expr}) +#endif + +struct _reduction_op { + __device__ __forceinline__ _type_reduce operator()( + const _type_reduce &a, const _type_reduce &b) const { + return ${reduce_expr}; + } +}; + +extern "C" +__global__ void ${name}(${params}) { + unsigned int _tid = threadIdx.x; +''' + + if pre_map_expr == 'in0': + module_code += ''' + // Specialize BlockLoad type for faster (?) 
loading
+    typedef cub::BlockLoad<_type_reduce, BLOCK_SIZE,
+                           ITEMS_PER_THREAD, cub::BLOCK_LOAD_DIRECT> BlockLoadT;
+
+    // Shared memory for loading
+    __shared__ typename BlockLoadT::TempStorage temp_storage_load;
+'''
+
+    module_code += '''
+    // Specialize BlockReduce type for our thread block
+    typedef cub::BlockReduce<_type_reduce, BLOCK_SIZE> BlockReduceT;
+
+    // Shared memory for reduction
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
+    // Declare reduction operation
+    _reduction_op op;
+
+    // input & output raw pointers
+    const type_mid_in* _in0 = static_cast<const type_mid_in*>(_raw_in0);
+    type_mid_out* _out0 = static_cast<type_mid_out*>(_raw_out0);
+
+    // Per-thread tile data
+    _type_reduce _sdata[ITEMS_PER_THREAD];
+    #pragma unroll
+    for (int j = 0; j < ITEMS_PER_THREAD; j++) {
+        _sdata[j] = _type_reduce(${identity});
+    }
+
+    // each block handles the reduction of 1 segment
+    size_t segment_idx = blockIdx.x * _segment_size;
+    const type_mid_in* segment_head = _in0 + segment_idx;
+    size_t i = 0;  // tile head within the segment
+    int tile_size = (BLOCK_SIZE * ITEMS_PER_THREAD < _segment_size ?
+                     BLOCK_SIZE * ITEMS_PER_THREAD :
+                     _segment_size);
+    size_t _seg_size = _segment_size;
+
+    #if defined FIRST_PASS
+    // for two-pass reduction only: "last segment" is special
+    if (_array_size > 0) {
+        if (_array_size - segment_idx <= _segment_size) {
+            _seg_size = _array_size - segment_idx;
+        }
+        #ifdef __HIP_DEVICE_COMPILE__
+        // We don't understand HIP...
+        __syncthreads();  // Propagate the new value back to memory
+        #endif
+    }
+    #endif
+
+    // loop over tiles within 1 segment
+    _type_reduce aggregate = _type_reduce(${identity});
+    for (i = 0; i < _seg_size; i += BLOCK_SIZE * ITEMS_PER_THREAD) {
+        // for the last tile
+        if (_seg_size - i <= tile_size) { tile_size = _seg_size - i; }
+'''
+
+    if pre_map_expr == 'in0':
+        module_code += '''
+        // load a tile
+        BlockLoadT(temp_storage_load).Load(segment_head + i, _sdata,
+                                           tile_size, _type_reduce(${identity}));
+'''
+    else:  # pre_map_expr could be something like "in0 != type_in0_raw(0)"
+        module_code += '''
+        // load a tile
+        #pragma unroll
+        for (int j = 0; j < ITEMS_PER_THREAD; j++) {
+            // index of the element in a tile
+            int e_idx = _tid * ITEMS_PER_THREAD + j;
+
+            // some pre_map_expr uses _J internally...
+            #if defined FIRST_PASS
+            int _J = (segment_idx + i + e_idx);
+            #else  // only one pass
+            int _J = (segment_idx + i + e_idx) % _seg_size;
+            #endif
+
+            if (e_idx < tile_size) {
+                const type_mid_in in0 = *(segment_head + i + e_idx);
+                _sdata[j] = static_cast<_type_reduce>(${pre_map_expr});
+            } else {
+                _sdata[j] = _type_reduce(${identity});
+            }
+        }
+'''
+
+    module_code += '''
+    // Compute block reduction
+    // Note that the output is only meaningful for thread 0
+    aggregate = op(aggregate, BlockReduceT(temp_storage).Reduce(_sdata, op));
+
+    __syncthreads();  // for reusing temp_storage
+    }
+
+    if (_tid == 0) {
+        type_mid_out& out0 = *(_out0 + blockIdx.x);
+        POST_MAP(aggregate);
+    }
+}
+'''
+
+    module_code = string.Template(module_code).substitute(
+        name=name,
+        block_size=block_size,
+        items_per_thread=items_per_thread,
+        reduce_type=reduce_type,
+        params=_get_cub_kernel_params(params, arginfos),
+        identity=identity,
+        reduce_expr=reduce_expr,
+        pre_map_expr=pre_map_expr,
+        post_map_expr=post_map_expr,
+        type_preamble=type_map.get_typedef_code(),
+        preamble=preamble)
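The template assembled above is never called directly by users; it is compiled below and dispatched to from CuPy's reduction machinery. A minimal sketch of a reduction that may take this CUB block-reduction path when the CUB accelerator is active (illustrative; assumes a CUDA device and CUPY_ACCELERATORS=cub):

>>> x = cupy.arange(10, dtype=cupy.float32).reshape(2, 5)
>>> cupy.sum(x, axis=1)  # contiguous reduction axis; eligible for CUB
array([10., 35.], dtype=float32)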
+
+    # To specify the backend, we have to explicitly spell out the default
+    # values for arch, cache_dir, and prepend_cupy_headers to bypass
+    # cdef/cpdef limitation...
+    module = compile_with_cache(
+        module_code, options, arch=None, cache_dir=None,
+        prepend_cupy_headers=True, backend=backend)
+    return module.get_function(name)
+
+
+@_util.memoize(for_each_device=True)
+def _SimpleCubReductionKernel_get_cached_function(
+        map_expr, reduce_expr, post_map_expr, reduce_type,
+        params, arginfos, _kernel._TypeMap type_map,
+        name, block_size, identity, preamble,
+        options, cub_params):
+    items_per_thread = cub_params[0]
+    name = name.replace('cupy_', 'cupy_cub_')
+    name = name.replace('cupyx_', 'cupyx_cub_')
+    return _create_cub_reduction_function(
+        name, block_size, items_per_thread,
+        reduce_type, params, arginfos, identity,
+        map_expr, reduce_expr, post_map_expr,
+        type_map, preamble, options)
+
+
+cdef str _cub_path = _environment.get_cub_path()
+cdef str _nvcc_path = _environment.get_nvcc_path()
+cdef str _rocm_path = _environment.get_rocm_path()
+cdef str _hipcc_path = _environment.get_hipcc_path()
+cdef str _cub_header = None
+
+
+cdef str _get_cub_header_include():
+    global _cub_header
+    if _cub_header is not None:
+        return _cub_header
+
+    assert _cub_path is not None
+    if _cub_path == '<bundle>':
+        _cub_header = '''
+#include <cupy/cuda_workaround.h>
+#include <cupy/cub/cub/block/block_reduce.cuh>
+#include <cupy/cub/cub/block/block_load.cuh>
+'''
+    elif _cub_path == '<CUDA>':
+        _cub_header = '''
+#include <cub/block/block_reduce.cuh>
+#include <cub/block/block_load.cuh>
+'''
+    elif _cub_path == '<ROCm>':
+        # As of ROCm 3.5.0, the block headers cannot be included by themselves
+        # (many macros left undefined), so we must use the master header.
+        _cub_header = '''
+#include <hipcub/hipcub.hpp>
+'''
+    return _cub_header
+
+
+# make it cpdef'd for unit tests
+cpdef inline tuple _can_use_cub_block_reduction(
+        list in_args, list out_args, tuple reduce_axis, tuple out_axis):
+    '''
+    If CUB BlockReduce can be used, this function returns a tuple of the
+    needed parameters; otherwise it returns None.
+    '''
+    cdef tuple axis_permutes_cub
+    cdef _ndarray_base in_arr
+    cdef Py_ssize_t contiguous_size = 1
+    cdef str order
+
+    # detect whether CUB headers exist somewhere:
+    if _cub_path is None:
+        import warnings
+        warnings.warn('CUB headers are not found.', RuntimeWarning)
+        return None
+
+    # we currently support reductions with 1 input and 1 output
+    if len(in_args) != 1 or len(out_args) != 1:
+        return None
+
+    in_arr = in_args[0]
+
+    # the axes might not be sorted when we arrive here...
+ reduce_axis = tuple(sorted(reduce_axis)) + out_axis = tuple(sorted(out_axis)) + + # check reduction axes, if not contiguous then fall back to old kernel + if in_arr._f_contiguous: + order = 'F' + if not cub._cub_device_segmented_reduce_axis_compatible( + reduce_axis, in_arr.ndim, order): + return None + axis_permutes_cub = reduce_axis + out_axis + elif in_arr._c_contiguous: + order = 'C' + if not cub._cub_device_segmented_reduce_axis_compatible( + reduce_axis, in_arr.ndim, order): + return None + axis_permutes_cub = out_axis + reduce_axis + else: + return None + if axis_permutes_cub != tuple(range(in_arr.ndim)): + return None + + # full-reduction of N-D array: need to invoke the kernel twice + cdef bint full_reduction = True if len(out_axis) == 0 else False + + # check if the number of elements is too large + # (ref: cupy/cupy#3309 for CUB limit) + for i in reduce_axis: + contiguous_size *= in_arr.shape[i] + if contiguous_size > 0x7fffffffffffffff or contiguous_size == 0: + return None + if full_reduction: + # assume a GPU has at most 64 GB of physical memory + if contiguous_size > 0x1000000000: + return None + else: + # the number of blocks to be launched exceeds INT_MAX: + if in_arr.size // contiguous_size > 0x7fffffff: + return None + + # rare event (mainly for conda-forge users): nvcc is not found! + if not runtime._is_hip_environment: + if _nvcc_path is None: + return None + else: + if _hipcc_path is None: + return None + + return (axis_permutes_cub, contiguous_size, full_reduction) + + +# similar to cupy._core._kernel._get_kernel_params() +cdef str _get_cub_kernel_params(tuple params, tuple arginfos): + cdef _kernel.ParameterInfo p + cdef _kernel._ArgInfo arginfo + cdef lst = [] + cdef str c_type, c_name + cdef int i + assert len(params) == len(arginfos) + + for i, (p, arginfo) in enumerate(zip(params, arginfos)): + c_name = arginfo.get_c_var_name(p) + if i < len(params) - 2: + c_type = 'const void*' if p.is_const else 'void*' + else: + # for segment size and array size + c_type = arginfo.get_param_c_type(p) + lst.append('{} {}'.format(c_type, c_name)) + return ', '.join(lst) + + +cdef Py_ssize_t _cub_default_block_size = ( + 256 if runtime._is_hip_environment else 512) + + +cdef (Py_ssize_t, Py_ssize_t) _get_cub_block_specs( # NOQA + Py_ssize_t contiguous_size): + # This is recommended in the CUB internal and should be an + # even number + items_per_thread = 4 + + # Calculate the reduction block dimensions. + # Ideally, we want each block to handle one segment, so: + # 1. block size < segment size: the block loops over the segment + # 2. 
block size >= segment size: the segment fits in the block + block_size = (contiguous_size + items_per_thread - 1) // items_per_thread + block_size = internal.clp2(block_size) + warp_size = 32 if not runtime._is_hip_environment else 64 + if block_size < warp_size: + block_size = warp_size + elif block_size > _cub_default_block_size: + block_size = _cub_default_block_size + + return items_per_thread, block_size + + +cdef _scalar.CScalar _cub_convert_to_c_scalar( + Py_ssize_t segment_size, Py_ssize_t value): + if segment_size > 0x7fffffff: + return _scalar.scalar_to_c_scalar(value) + else: + return _scalar.CScalar.from_int32(value) + + +cdef inline void _cub_two_pass_launch( + str name, Py_ssize_t block_size, Py_ssize_t segment_size, + Py_ssize_t items_per_thread, str reduce_type, tuple params, + list in_args, list out_args, + str identity, str pre_map_expr, str reduce_expr, str post_map_expr, + _kernel._TypeMap type_map, str preamble, + tuple options, stream) except*: + ''' + Notes: + 1. Two-pass reduction: the first pass distributes an even share over + a number of blocks (with block_size threads), and the second pass + does reduction over 1 block of threads + ''' + + cdef list out_args_2nd_pass = [out_args[0]] + cdef Py_ssize_t contiguous_size, out_block_num + cdef function.Function func + cdef memory.MemoryPointer memptr + cdef str post_map_expr1, post_map_expr2, f + cdef list inout_args + cdef tuple cub_params + cdef size_t gridx, blockx + cdef _ndarray_base in_arr + + # fair share + contiguous_size = min(segment_size, block_size * items_per_thread) + out_block_num = (segment_size + contiguous_size - 1) // contiguous_size + assert out_block_num <= 0x7fffffff + + # Because we can't know sizeof(reduce_type) in advance, here we + # conservatively assume it's 32 bytes and allocate a work area + memptr = memory.alloc(out_block_num * 32) + out_args[0] = memptr + + # ************************ 1st pass ************************ + name += '_pass1' + inout_args = [in_args[0], out_args[0], + _cub_convert_to_c_scalar(segment_size, contiguous_size), + _cub_convert_to_c_scalar(segment_size, segment_size)] + cub_params = (items_per_thread,) + + if 'mean' in name: + post_map_expr1 = post_map_expr.replace('_in_ind.size()', '1.0') + post_map_expr1 = post_map_expr1.replace('_out_ind.size()', '1.0') + elif any((f in name for f in ('argmax', 'argmin'))): + # Workaround: in NumPy the indices are always generated based on + # a C-order array (since PyArray_ContiguousFromAny was called). + # We have to do a conversion here (?) since we do not retain the + # info on strides. + # TODO(leofang): improve this workaround + in_arr = in_args[0] + if in_arr.ndim > 1 and in_arr._f_contiguous: + in_arr = _internal_ascontiguousarray(in_arr) + inout_args[0] = in_args[0] = in_arr + post_map_expr1 = post_map_expr + else: + post_map_expr1 = post_map_expr + + # Retrieve the kernel function + func = _SimpleCubReductionKernel_get_cached_function( + pre_map_expr, reduce_expr, post_map_expr1, reduce_type, + params, + _kernel._get_arginfos(inout_args), + type_map, + name, block_size, identity, preamble, + ('-DFIRST_PASS=1',), cub_params) + + # Kernel arguments passed to the __global__ function. 
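The Notes above describe the two-pass scheme; it can be modeled in a few lines of NumPy. This sketch is illustrative only and ignores the conservative 32-byte work-area bookkeeping:

import numpy as np

def two_pass_sum(x, n_blocks):
    # pass 1: each block reduces an even share into one partial result
    partials = np.array([s.sum() for s in np.array_split(x, n_blocks)])
    # pass 2: a single block reduces the per-block partials
    return partials.sum()

x = np.arange(10**6, dtype=np.float64)
assert two_pass_sum(x, 128) == x.sum()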
+ gridx = (out_block_num * block_size) + blockx = block_size + + # Launch the kernel + func.linear_launch(gridx, inout_args, 0, blockx, stream) + + # ************************ 2nd pass ************************ + name = name[:-1] + '2' + contiguous_size = out_block_num + out_block_num = 1 + in_args = out_args + out_args = out_args_2nd_pass + inout_args = [in_args[0], out_args[0], + _cub_convert_to_c_scalar(segment_size, contiguous_size), + _cub_convert_to_c_scalar(segment_size, segment_size)] + + # For mean() + if 'mean' in name: + post_map_expr2 = post_map_expr.replace('_in_ind.size()', + '_array_size') + post_map_expr2 = post_map_expr2.replace('_out_ind.size()', '1.0') + else: + post_map_expr2 = post_map_expr + + # Retrieve the kernel function + func = _SimpleCubReductionKernel_get_cached_function( + 'in0', reduce_expr, post_map_expr2, reduce_type, + params, + _kernel._get_arginfos(inout_args), + type_map, + name, block_size, identity, preamble, + ('-DSECOND_PASS=1',), cub_params) + + # Kernel arguments passed to the __global__ function. + gridx = (out_block_num * block_size) + blockx = block_size + + # Launch the kernel + func.linear_launch(gridx, inout_args, 0, blockx, stream) + + +cdef inline void _launch_cub( + self, out_block_num, block_size, block_stride, + in_args, out_args, in_shape, out_shape, type_map, + map_expr, reduce_expr, post_map_expr, reduce_type, + stream, params, cub_params) except *: + cdef bint full_reduction + cdef Py_ssize_t contiguous_size, items_per_thread + cdef function.Function func + + # Kernel arguments passed to the __global__ function. + items_per_thread = cub_params[0] + contiguous_size = cub_params[1] + full_reduction = cub_params[2] + + if full_reduction: + _cub_two_pass_launch( + self.name, block_size, contiguous_size, items_per_thread, + reduce_type, params, in_args, out_args, self.identity, + map_expr, reduce_expr, post_map_expr, + type_map, self.preamble, (), stream) + return + else: + inout_args = ( + in_args + out_args + + [_cub_convert_to_c_scalar( + contiguous_size, contiguous_size), + _cub_convert_to_c_scalar( + contiguous_size, 0)]) + arginfos = _kernel._get_arginfos(inout_args) + func = _SimpleCubReductionKernel_get_cached_function( + map_expr, reduce_expr, post_map_expr, reduce_type, + params, arginfos, type_map, + self.name, block_size, self.identity, self.preamble, + (), cub_params) + + func.linear_launch( + out_block_num * block_size, inout_args, 0, block_size, stream) + + +def _get_cub_optimized_params( + self, optimize_config, in_args, out_args, in_shape, out_shape, + type_map, map_expr, reduce_expr, post_map_expr, reduce_type, + stream, full_reduction, out_block_num, contiguous_size, params): + in_args = [_reduction._optimizer_copy_arg(a) for a in in_args] + out_args = [_reduction._optimizer_copy_arg(a) for a in out_args] + + items_per_thread, block_size = ( + _get_cub_block_specs(contiguous_size)) + default_block_size_log = math.floor(math.log2(block_size)) + default_items_per_thread = items_per_thread + + def target_func(block_size, items_per_thread): + block_stride = block_size * items_per_thread + cub_params = ( + items_per_thread, contiguous_size, full_reduction) + _launch_cub( + self, + out_block_num, block_size, block_stride, in_args, out_args, + in_shape, out_shape, type_map, map_expr, reduce_expr, + post_map_expr, reduce_type, stream, params, cub_params) + + def suggest_func(trial): + block_size_log = trial.suggest_int('block_size_log', 5, 10) + block_size = 2 ** block_size_log + items_per_thread = trial.suggest_int( + 
'items_per_thread', 2, 32, step=2) + + trial.set_user_attr('block_size', block_size) + return block_size, items_per_thread + + # CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is a possible error + optimize_impl = optimize_config.optimize_impl + best = optimize_impl( + optimize_config, target_func, suggest_func, + default_best={ + 'block_size_log': default_block_size_log, + 'items_per_thread': default_items_per_thread, + }, ignore_error=(driver.CUDADriverError,)) + + return best.params['items_per_thread'], best.user_attrs['block_size'] + + +cdef bint _try_to_call_cub_reduction( + self, list in_args, list out_args, const shape_t& a_shape, + stream, optimize_context, tuple key, + map_expr, reduce_expr, post_map_expr, + reduce_type, _kernel._TypeMap type_map, + tuple reduce_axis, tuple out_axis, const shape_t& out_shape, + _ndarray_base ret) except *: + """Try to use cub. + + Updates `ret` and returns a boolean value whether cub is used. + + Note: input_expr and output_expr are not used in CUB kernels. + """ + cdef tuple axis_permutes + cdef tuple params, opt_params + cdef shape_t in_shape + cdef Py_ssize_t i + cdef Py_ssize_t contiguous_size = -1 + cdef Py_ssize_t block_size, block_stride, out_block_num = 0 + + # decide to use CUB or not + can_use_cub = _can_use_cub_block_reduction( + in_args, out_args, reduce_axis, out_axis) + + if can_use_cub is None: + return False + + axis_permutes, contiguous_size, full_reduction = can_use_cub + + in_shape = _reduction._set_permuted_args( + in_args, axis_permutes, a_shape, self.in_params) + + if in_args[0]._f_contiguous: + ret._set_contiguous_strides(ret.dtype.itemsize, False) + out_args[0] = ret + + if not full_reduction: # just need one pass + out_block_num = 1 # = number of segments + for i in out_axis: + out_block_num *= in_shape[i] + + if 'mean' in self.name: + post_map_expr = post_map_expr.replace( + '_in_ind.size()', '_segment_size') + post_map_expr = post_map_expr.replace( + '_out_ind.size()', '1.0') + + if contiguous_size > 0x7fffffff: # INT_MAX + size_type = 'uint64' + else: + size_type = 'int32' + type_map = _kernel._TypeMap(type_map._pairs + (('sizeT', size_type),)) + params = (self._params[0:2] + + _get_param_info(size_type + ' _segment_size', True) + + _get_param_info(size_type + ' _array_size', True)) + + # HACK for ReductionKernel: + # 1. input/output arguments might not be named as in0/out0 + # 2. pre-/post- maps might not contain in0/out0 + # 3. type_map does not contain the expected names (type_in0_raw and + # type_out0_raw) + cdef str old_in0 = params[0].name, old_out0 = params[1].name + if old_in0 != 'in0' or old_out0 != 'out0': + # avoid overwriting self's attributes + params = (_get_param_info('T in0', True) + + _get_param_info('T out0', False) + + params[2:]) + map_expr = map_expr.replace(old_in0, 'in0') + post_map_expr = post_map_expr.replace(old_out0, 'out0') + type_map = _kernel._TypeMap(type_map._pairs + ( + ('type_in0_raw', in_args[0].dtype.type), + ('type_out0_raw', out_args[0].dtype.type), + )) + + # Calculate the reduction block dimensions. 
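The manual path below calls _get_cub_block_specs; its heuristic is easy to see in pure Python (a sketch; internal.clp2 is CuPy-internal, so a bit trick stands in for it):

def cub_block_specs(contiguous_size, items_per_thread=4,
                    warp_size=32, max_block_size=512):
    # one thread handles items_per_thread elements; round the needed
    # thread count up to a power of two, then clamp to [warp, max]
    needed = -(-contiguous_size // items_per_thread)       # ceil division
    block_size = 1 << max(0, needed - 1).bit_length()      # clp2
    return items_per_thread, min(max(block_size, warp_size), max_block_size)

assert cub_block_specs(1000) == (4, 256)
assert cub_block_specs(100) == (4, 32)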
+ optimize_context = _optimize_config.get_current_context() + if optimize_context is None: + # Calculate manually + items_per_thread, block_size = _get_cub_block_specs(contiguous_size) + else: + # Optimize dynamically + key = ('cub_reduction',) + key + opt_params = optimize_context.get_params(key) + if opt_params is None: + opt_params = _get_cub_optimized_params( + self, + optimize_context.config, in_args, out_args, + in_shape, out_shape, type_map, map_expr, reduce_expr, + post_map_expr, reduce_type, stream, + full_reduction, out_block_num, contiguous_size, params) + optimize_context.set_params(key, opt_params) + items_per_thread, block_size = opt_params + + block_stride = block_size * items_per_thread + cub_params = (items_per_thread, contiguous_size, full_reduction) + + _launch_cub( + self, + out_block_num, + block_size, + block_stride, + in_args, out_args, + in_shape, out_shape, + type_map, + map_expr, reduce_expr, post_map_expr, reduce_type, + stream, params, cub_params) + + return True diff --git a/cupy/_core/_dtype.pxd b/cupy/_core/_dtype.pxd new file mode 100644 index 0000000..6dfc4af --- /dev/null +++ b/cupy/_core/_dtype.pxd @@ -0,0 +1,10 @@ +cpdef get_dtype(t) +cpdef tuple get_dtype_with_itemsize(t) +cpdef int to_cuda_dtype(dtype, bint is_half_allowed=*) except -1 + +cpdef void _raise_if_invalid_cast( + from_dt, + to_dt, + str casting, + argname=* +) except * diff --git a/cupy/_core/_dtype.pyx b/cupy/_core/_dtype.pyx new file mode 100644 index 0000000..e5bc725 --- /dev/null +++ b/cupy/_core/_dtype.pyx @@ -0,0 +1,126 @@ +cimport cython # NOQA +import numpy +import warnings + +from cupy_backends.cuda.api cimport runtime + + +all_type_chars = '?bhilqBHILQefdFD' +# for c in '?bhilqBHILQefdFD': +# print('#', c, '...', np.dtype(c).name) +# ? ... bool +# b ... int8 +# h ... int16 +# i ... int32 +# l ... int64 (int32 in windows) +# q ... int64 +# B ... uint8 +# H ... uint16 +# I ... uint32 +# L ... uint64 (uint32 in windows) +# Q ... uint64 +# e ... float16 +# f ... float32 +# d ... float64 +# F ... complex64 +# D ... 
complex128 + +cdef dict _dtype_dict = {} +cdef _dtype = numpy.dtype + + +cdef _init_dtype_dict(): + for i in (int, float, bool, complex, None): + dtype = _dtype(i) + _dtype_dict[i] = (dtype, dtype.itemsize) + for i in all_type_chars: + dtype = _dtype(i) + item = (dtype, dtype.itemsize) + _dtype_dict[i] = item + _dtype_dict[dtype.type] = item + for i in {str(_dtype(i)) for i in all_type_chars}: + dtype = _dtype(i) + _dtype_dict[i] = (dtype, dtype.itemsize) + + +_init_dtype_dict() + + +@cython.profile(False) +cpdef get_dtype(t): + ret = _dtype_dict.get(t, None) + if ret is None: + return _dtype(t) + return ret[0] + + +@cython.profile(False) +cpdef tuple get_dtype_with_itemsize(t): + ret = _dtype_dict.get(t, None) + if ret is None: + t = _dtype(t) + return t, t.itemsize + return ret + + +cpdef int to_cuda_dtype(dtype, bint is_half_allowed=False) except -1: + cdef str dtype_char + try: + dtype_char = dtype.char + except AttributeError: + dtype_char = dtype + + if dtype_char == 'e' and is_half_allowed: + return runtime.CUDA_R_16F + elif dtype_char == 'f': + return runtime.CUDA_R_32F + elif dtype_char == 'd': + return runtime.CUDA_R_64F + elif dtype_char == 'F': + return runtime.CUDA_C_32F + elif dtype_char == 'D': + return runtime.CUDA_C_64F + elif dtype_char == 'E' and is_half_allowed: + # complex32, not supported in NumPy + return runtime.CUDA_C_16F + else: + raise TypeError('dtype is not supported: {}'.format(dtype)) + + +cdef _numpy_can_cast = numpy.can_cast + + +cpdef void _raise_if_invalid_cast( + from_dt, to_dt, str casting, argname="array data" +) except *: + """Raise an error if a cast is not valid. Also check whether the cast + goes from complex to real and warn if it does. + + The error raised can be customized by giving `argname`; a (lambda) + function may be passed to avoid string construction on success. + This function exists mainly to build a similar error everywhere. + + """ + if from_dt is to_dt: + return + + to_dt = get_dtype(to_dt) # may still be a type, not a dtype instance + + if casting == "same_kind" and from_dt.kind == to_dt.kind: + # same-kind is the most common casting used with NumPy dtypes. + return + if _numpy_can_cast(from_dt, to_dt, casting): + if casting == "unsafe" and from_dt.kind == "c" and to_dt.kind in "iuf": + # Complex warning: we are dropping the imaginary part. + warnings.warn( + 'Casting complex values to real discards the imaginary part', + numpy.ComplexWarning) + + return + + # Casting is not possible, raise the error + if not isinstance(argname, str): + argname = argname() + raise TypeError( + f'Cannot cast {argname} from {from_dt!r} to {to_dt!r} ' + f'according to the rule \'{casting}\'') diff --git a/cupy/_core/_fusion_interface.py b/cupy/_core/_fusion_interface.py new file mode 100644 index 0000000..44f9a4b --- /dev/null +++ b/cupy/_core/_fusion_interface.py @@ -0,0 +1,272 @@ +import numpy + +from cupy._core._dtype import get_dtype +import cupy +from cupy._core import _fusion_thread_local +from cupy._core import core +from cupy._core._scalar import get_typename + + +_thread_local = _fusion_thread_local.thread_local + + +_dtype_to_astype_dict = None + + +def _set_dtype_to_astype_dict(): + """Set a dict with dtypes and astype ufuncs to `_dtype_to_astype_dict`. + + Creates a ufunc for type cast operations, and sets a dict with keys + as the dtype of the output array and values as astype ufuncs. + This function is called at most once.
+ """ + global _dtype_to_astype_dict + _dtype_to_astype_dict = {} + + dtype_list = [numpy.dtype(type_char) for type_char in '?bhilqBHILQefdFD'] + + for t in dtype_list: + name = 'astype_{}'.format(t) + rules = tuple(['{}->{}'.format(s.char, t.char) for s in dtype_list]) + command = 'out0 = static_cast< {} >(in0)'.format(get_typename(t)) + _dtype_to_astype_dict[t] = core.create_ufunc(name, rules, command) + + +class _VariableProxy: + """Abstracted array/scalar object passed to the target function. + """ + + def __init__(self, content): + assert isinstance(content, cupy._core._fusion_variable._TraceVariable) + self.content = content + + def __neg__(self): + return cupy.negative(self) + + def __add__(self, other): + return cupy.add(self, other) + + def __radd__(self, other): + return cupy.add(other, self) + + def __sub__(self, other): + return cupy.subtract(self, other) + + def __rsub__(self, other): + return cupy.subtract(other, self) + + def __mul__(self, other): + return cupy.multiply(self, other) + + def __rmul__(self, other): + return cupy.multiply(other, self) + + def __div__(self, other): + return cupy.divide(self, other) + + def __rdiv__(self, other): + return cupy.divide(other, self) + + def __truediv__(self, other): + return cupy.true_divide(self, other) + + def __rtruediv__(self, other): + return cupy.true_divide(other, self) + + def __floordiv__(self, other): + return cupy.floor_divide(self, other) + + def __rfloordiv__(self, other): + return cupy.floor_divide(other, self) + + def __mod__(self, other): + return cupy.remainder(self, other) + + def __rmod__(self, other): + return cupy.remainder(other, self) + + def __pow__(self, other): + return cupy.power(self, other) + + def __lshift__(self, other): + return cupy.left_shift(self, other) + + def __rlshift__(self, other): + return cupy.left_shift(other, self) + + def __rshift__(self, other): + return cupy.right_shift(self, other) + + def __rrshift__(self, other): + return cupy.right_shift(other, self) + + def __invert__(self): + return cupy.invert(self) + + def __and__(self, other): + return cupy.bitwise_and(self, other) + + def __rand__(self, other): + return cupy.bitwise_and(other, self) + + def __or__(self, other): + return cupy.bitwise_or(self, other) + + def __ror__(self, other): + return cupy.bitwise_or(other, self) + + def __xor__(self, other): + return cupy.bitwise_xor(self, other) + + def __rxor__(self, other): + return cupy.bitwise_xor(other, self) + + def __lt__(self, other): + return cupy.less(self, other) + + def __le__(self, other): + return cupy.less_equal(self, other) + + def __eq__(self, other): + return cupy.equal(self, other) + + def __ne__(self, other): + return cupy.not_equal(self, other) + + def __ge__(self, other): + return cupy.greater_equal(self, other) + + def __gt__(self, other): + return cupy.greater(self, other) + + def copy(self): + return cupy.copy(self) + + def astype(self, dtype, order=None, casting=None, subok=None, copy=True): + dtype = get_dtype(dtype) + if order is not None: + raise TypeError('order is not supported yet') + if casting is not None: + raise TypeError('casting is not supported yet') + if subok is not None: + raise TypeError('subok is not supported yet') + if not copy and self.dtype == dtype: + return self + if _dtype_to_astype_dict is None: + _set_dtype_to_astype_dict() + return _dtype_to_astype_dict[dtype](self) + + def sum(self, axis=None, dtype=None, out=None, keepdims=False): + return cupy.sum( + self, axis=axis, dtype=dtype, out=out, keepdims=keepdims) + + def prod(self, 
axis=None, dtype=None, out=None, keepdims=False): + return cupy.prod( + self, axis=axis, dtype=dtype, out=out, keepdims=keepdims) + + def max(self, axis=None, out=None, keepdims=False): + return cupy.max(self, axis=axis, out=out, keepdims=keepdims) + + def min(self, axis=None, out=None, keepdims=False): + return cupy.min(self, axis=axis, out=out, keepdims=keepdims) + + def all(self, axis=None, out=None, keepdims=False): + return cupy.all(self, axis=axis, out=out, keepdims=keepdims) + + def any(self, axis=None, out=None, keepdims=False): + return cupy.any(self, axis=axis, out=out, keepdims=keepdims) + + @property + def dtype(self): + return self.content.dtype + + @property + def ndim(self): + return self.content.ndim + + @property + def shape(self): + raise NotImplementedError('`shape` is not supported, currently.') + + +class _ScalarProxy(_VariableProxy): + """An abstracted scalar object passed to the target function. + + Attributes: + dtype(dtype): The dtype of the array. + imag(_ArrayProxy): The imaginary part of the array (Not implemented) + real(_ArrayProxy): The real part of the array (Not implemented) + ndim(int): The number of dimensions of the array. + """ + + def __repr__(self): + return '_ScalarProxy({}, dtype={})'.format( + self._emit_param_name(), self.dtype) + + +class _ArrayProxy(_VariableProxy): + """An abstracted array object passed to the target function. + + Attributes: + dtype(dtype): The dtype of the array. + imag(_ArrayProxy): The imaginary part of the array (Not implemented) + real(_ArrayProxy): The real part of the array (Not implemented) + ndim(int): The number of dimensions of the array. + """ + + def __repr__(self): + return '_ArrayProxy([...], dtype=\'{}\', ndim={})'.format( + self.dtype.char, self.ndim) + + def _inplace_op(self, ufunc, other): + return ufunc(self, other, self) + + def __iadd__(self, other): + return self._inplace_op(cupy.add, other) + + def __isub__(self, other): + return self._inplace_op(cupy.subtract, other) + + def __imul__(self, other): + return self._inplace_op(cupy.multiply, other) + + def __idiv__(self, other): + return self._inplace_op(cupy.divide, other) + + def __itruediv__(self, other): + return self._inplace_op(cupy.true_divide, other) + + def __ifloordiv__(self, other): + return self._inplace_op(cupy.floor_divide, other) + + def __imod__(self, other): + return self._inplace_op(cupy.remainder, other) + + def __ipow__(self, other): + return self._inplace_op(cupy.power, other) + + def __ilshift__(self, other): + return self._inplace_op(cupy.left_shift, other) + + def __irshift__(self, other): + return self._inplace_op(cupy.right_shift, other) + + def __iand__(self, other): + return self._inplace_op(cupy.bitwise_and, other) + + def __ior__(self, other): + return self._inplace_op(cupy.bitwise_or, other) + + def __ixor__(self, other): + return self._inplace_op(cupy.bitwise_xor, other) + + def __getitem__(self, index): + return _fusion_thread_local.call_indexing(self, index) + + def __setitem__(self, slices, value): + if slices is Ellipsis or ( + isinstance(slices, slice) and slices == slice(None)): + _fusion_thread_local.call_ufunc( + core.elementwise_copy, value, out=self) + else: + raise ValueError('The fusion supports `[...]` or `[:]`.') diff --git a/cupy/_core/_fusion_kernel.pyx b/cupy/_core/_fusion_kernel.pyx new file mode 100644 index 0000000..ed8f837 --- /dev/null +++ b/cupy/_core/_fusion_kernel.pyx @@ -0,0 +1,364 @@ +import itertools +import string + +from libcpp cimport vector + +from cupy._core cimport _carray +from 
cupy._core.core cimport _ndarray_init +from cupy._core.core cimport compile_with_cache +from cupy._core.core cimport _ndarray_base +from cupy._core cimport internal +from cupy._core cimport _routines_manipulation as _manipulation +from cupy_backends.cuda.api cimport driver +from cupy_backends.cuda.api cimport runtime + +import cupy as _cupy +from cupy._core import _dtype +from cupy import _util +from cupy._core import _codeblock +from cupy._core import _fusion_op +from cupy._core._fusion_variable import _TraceVariable +from cupy._core._fusion_variable import _TraceScalar +from cupy._core._fusion_variable import _TraceArray + + +cdef Py_ssize_t _default_block_size = ( + 256 if runtime._is_hip_environment else 512) + + +@_util.memoize(for_each_device=True) +def _cuda_compile(preamble, name, cuda_params, cuda_body, use_grid_sync): + template = ( + '${preamble}\n\n' + 'extern "C" __global__ void ${name}(${cuda_params}) ${cuda_body}\n' + ) + + if use_grid_sync: + template = '#include <cooperative_groups.h>\n\n' + template + + code = string.Template(template).substitute( + preamble=preamble, + name=name, + cuda_params=cuda_params, + cuda_body=cuda_body) + + # (For contributors) We can view the whole generated CUDA code + # by uncommenting the following line. + # print(code) + + module = compile_with_cache( + code, (), None, None, True, 'nvrtc', False, use_grid_sync) + return module.get_function(name) + + +cdef class FusedKernel: + cdef: + readonly object shape_constraints + + readonly str _name + readonly list _params + readonly int _return_size + readonly str _submodule_code + readonly str _cuda_body + readonly dict _cuda_params_memo + readonly list _block_strides + readonly bint _use_grid_sync + + readonly list _reduction_in_array + readonly list _reduction_out_array + readonly vector.vector[bint] _is_base + readonly list _dtypes + readonly vector.vector[Py_ssize_t] _input_index + readonly vector.vector[Py_ssize_t] _view_of + readonly vector.vector[Py_ssize_t] _out_params + + def __init__(self, name, trace_result): + op_list = trace_result.op_list + params = trace_result.params + return_size = trace_result.return_size + self.shape_constraints = trace_result.shape_constraints + + self._name = name + self._params = sorted(params, key=lambda x: x.serial_number) + self._cuda_params_memo = {} + + # Generate the device functions. + submodule_code = '\n\n'.join(set(itertools.chain.from_iterable([ + op.emit_preamble_codes() for op in op_list]))) + '\n\n' + submodule_code += '\n\n'.join(itertools.chain.from_iterable([ + op.emit_submodule_codes() for op in op_list])) + + # Generate the function body of a __global__ function. + codes = [] + + self._use_grid_sync = len(op_list) > 1 + + if self._use_grid_sync: + codes.append('namespace _cg = cooperative_groups;') + codes.append('_cg::grid_group _grid = _cg::this_grid();') + + for i, op in enumerate(op_list): + if i > 0: + codes.append('_cg::sync(_grid);') + codes.append(op.emit_code()) + + self._submodule_code = submodule_code + self._cuda_body = str(_codeblock.CodeBlock('', codes)) + + # Check the format of the return value. + if return_size == 'none': + self._return_size = -1 + self._out_params.resize(0) + elif return_size == 'single': + self._return_size = -2 + self._out_params.resize(1) + else: + assert isinstance(return_size, int) + assert return_size >= 0 + self._return_size = return_size + self._out_params.resize(return_size) + + for p in self._params: + assert isinstance(p, _TraceVariable) + + # Analyse the relationship between variables.
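_cuda_compile above boils down to stdlib string templating before the source is handed to the nvrtc-backed compile cache. A standalone miniature with hypothetical values (illustration only):

import string

template = ('${preamble}\n\n'
            'extern "C" __global__ void ${name}(${cuda_params}) ${cuda_body}\n')
src = string.Template(template).substitute(
    preamble='// no preamble', name='my_kernel',
    cuda_params='const float* x, float* y',
    cuda_body='{ y[0] = x[0]; }')
print(src)  # the exact text that would be compiled and cached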
+ + array_dict = {} + self._reduction_in_array = [] + self._reduction_out_array = [] + self._dtypes = [] + + for i, p in enumerate(self._params): + view_of = -1 + input_index = -1 + if p.input_index is not None: + input_index = p.input_index + if isinstance(p, _TraceArray): + if p._view_of is not None: + view_of = array_dict[p._view_of.key()] + if p.is_output: + self._out_params[p.output_index] = i + array_dict[p.key()] = i + self._is_base.push_back(p.is_base) + self._dtypes.append(_dtype.get_dtype(p.dtype)) + self._input_index.push_back(input_index) + self._view_of.push_back(view_of) + + self._block_strides = [] + + for op in op_list: + if isinstance(op, _fusion_op._ReductionTraceOp): + self._reduction_in_array.append( + array_dict[op.in_params.item().key()]) + self._reduction_out_array.append( + array_dict[op.out_params.item().key()]) + self._block_strides.append( + 'int {}'.format(op.block_stride_name)) + + def get_shapes_of_kernel_params(self, tuple args): + """Returns the shapes of parameters passed to kern.linear_launch. + """ + cdef list kernel_param_shapes = [] + cdef int axis + cdef list shape + + for param in self._params: + shape = [] + if isinstance(param, _TraceArray): + ashape = param.ashape + for axis in range(len(ashape)): + dim = ashape[axis] + if not isinstance(dim, int): + dim = args[dim.input_index].shape[dim.axis] + shape.append(dim) + kernel_param_shapes.append(tuple(shape)) + return kernel_param_shapes + + cdef list _get_ndarray_list(self, tuple args, list shapes): + """Get the list of ndarrays corresponding to ``self._params``. + """ + cdef list ndarray_list = [] + cdef list params = self._params + cdef int i + for i in range(len(params)): + param = params[i] + shape = shapes[i] + if self._input_index[i] >= 0: + array = args[self._input_index[i]] + elif isinstance(param, _TraceScalar): + array = None + elif self._is_base[i]: + array = _ndarray_init( + _cupy.ndarray, shape, self._dtypes[i], None) + else: + view_of = ndarray_list[self._view_of[i]] + if param.is_broadcast: + array = _manipulation.broadcast_to(view_of, shape) + elif param.slice_key is not None: + array = view_of[param.slice_key] + elif param.rotate_axis is not None: + axis_permutes = list(param.rotate_axis) + for i in range(param.ndim): + if i not in param.rotate_axis: + axis_permutes.append(i) + axis_permutes = tuple(axis_permutes) + array = _manipulation._transpose(view_of, axis_permutes) + else: + assert False + # For debug + # if isinstance(array, ndarray) and param.rotate_axis is None: + # assert array.shape == shape, (array.shape, shape) + ndarray_list.append(array) + + return ndarray_list + + cdef object _get_return_value(self, list ndarray_list): + """Get the return value of ``self.execute``. + """ + cdef int i + + if self._return_size == -1: + return None + + if self._return_size == -2: + return ndarray_list[self._out_params[0]] + + return tuple([ + ndarray_list[self._out_params[i]] + for i in range(self._return_size) + ]) + + cdef tuple _get_kernel_size(self, list ndarray_list): + """Calculate the number of contiguous blocks in non-reduction axes + of input arrays, and derive the block strides, block size and shared memory size from them.
+ """ + cdef _ndarray_base in_array, out_array + cdef Py_ssize_t block_size, block_stride, contiguous_size + + cdef list block_strides = [] + + if len(self._reduction_in_array) == 0: + return [], 256, 0 + + block_size = _default_block_size + for i in range(len(self._reduction_in_array)): + in_array = ndarray_list[self._reduction_in_array[i]] + out_array = ndarray_list[self._reduction_out_array[i]] + + # TODO(asi1024): Fix block strides for performance. + contiguous_size = 1 + itemsize = in_array.dtype.itemsize + for i in range(out_array.ndim): + if in_array.strides[-i-1] != contiguous_size * itemsize: + break + contiguous_size *= in_array.shape[-i-1] + contiguous_size = min(contiguous_size, 32) + + reduce_block_size = max(1, in_array.size // max(1, out_array.size)) + block_stride = max( + contiguous_size, block_size // reduce_block_size) + block_stride = internal.clp2(block_stride // 2 + 1) # floor + block_strides.append(block_stride) + + shared_mem = block_size * 32 # max bytesize of reduce_ctype. + return block_strides, block_size, shared_mem + + cdef tuple _reduce_dims(self, list ndarray_list): + """Reduce number of dimensions of ndarrays and returns the cache key. + """ + cdef list params = self._params + cdef list ndims = [] + cdef _ndarray_base array + cdef int i + + for i in range(len(params)): + param = params[i] + if param.ndim <= 1: + continue + array = ndarray_list[i] + array = array.reduced_view() + ndarray_list[i] = array + ndims.append(array.ndim) + + return tuple(ndims) + + cdef list _get_inout_args(self, tuple args, list ndarray_list): + """Get the arguments passed to ``kern.linear_launch``. + """ + cdef list params = [] + cdef list indexers = [] + cdef _carray.Indexer indexer + + for i in range(len(self._params)): + array = ndarray_list[i] + if isinstance(array, _ndarray_base): + indexer = _carray.Indexer.__new__(_carray.Indexer) + indexer.init(array._shape) + indexers.append(indexer) + params.append(array) + elif self._input_index[i] >= 0: + obj = args[self._input_index[i]] + params.append(obj) + + return params + indexers + + cdef str _get_cuda_params(self, tuple key, list ndarray_list): + """Get a string of parameters of CUDA main function code. 
+ """ + cdef int i + + if key in self._cuda_params_memo: + return self._cuda_params_memo[key] + + cuda_params = [] + indexers = [] + + for i in range(len(self._params)): + a = self._params[i] + if isinstance(a, _TraceArray): + array = ndarray_list[i] + ndim = array.ndim + c_contiguous = 'true' if array._c_contiguous else 'false' + index_32_bits = 'true' if array._index_32_bits else 'false' + cuda_params.append(a.format( + 'CArray<${type}, ${ndim}, ${cont}, ${ind32}> ${var}', + ndim=ndim, cont=c_contiguous, ind32=index_32_bits)) + indexers.append( + a.format('CIndexer<${ndim}> ${indexer}', ndim=ndim)) + elif isinstance(a, _TraceScalar): + if a.const_value is None: + cuda_params.append(a.format('${type} ${var}')) + else: + raise TypeError('Unknown type {}.'.format(type(a))) + + ret = cuda_params + indexers + self._block_strides + ret = ', '.join(ret) + self._cuda_params_memo[key] = ret + return ret + + def execute(self, tuple args, list shapes): + ndarray_list = self._get_ndarray_list(args, shapes) + ret = self._get_return_value(ndarray_list) + reduce_key = self._reduce_dims(ndarray_list) + inout_args = self._get_inout_args(args, ndarray_list) + cuda_params = self._get_cuda_params(reduce_key, ndarray_list) + kern = _cuda_compile( + self._submodule_code, self._name, cuda_params, self._cuda_body, + self._use_grid_sync) + + block_strides, block_size, shared_mem = ( + self._get_kernel_size(ndarray_list)) + + # TODO(asi1024): Optimize kernel size parameter. + if not runtime._is_hip_environment: + kern_size = driver.occupancyMaxActiveBlocksPerMultiprocessor( + kern.ptr, block_size, shared_mem) * block_size + else: + # In HIP sometimes the occupancy calc seems to be broken + kern_size = block_size * 512 + + kargs = inout_args + block_strides + kern.linear_launch( + kern_size, kargs, shared_mem, block_size, + enable_cooperative_groups=self._use_grid_sync) + return ret diff --git a/cupy/_core/_fusion_op.py b/cupy/_core/_fusion_op.py new file mode 100644 index 0000000..e5a1bad --- /dev/null +++ b/cupy/_core/_fusion_op.py @@ -0,0 +1,316 @@ +import string + +import numpy + +from cupy._core import _codeblock +from cupy._core._fusion_variable import _TraceVariable +from cupy._core._fusion_variable import _TraceArray +from cupy._core._fusion_variable import _VariableSet +from cupy._core import _fusion_thread_local +from cupy._core import _kernel +from cupy._core import _reduction +from cupy._core._scalar import get_typename + + +class _UfuncRoutine: + """A device function for single elementwise operations. + """ + + def __init__( + self, name, ufunc, routine_code, in_params, out_params, + compute_dtypes): + assert isinstance(name, str) + assert isinstance(ufunc, _kernel.ufunc) + assert isinstance(routine_code, str) + assert isinstance(compute_dtypes, tuple) + assert all(isinstance(t, numpy.dtype) for t in compute_dtypes) + assert isinstance(in_params, list) + assert all(isinstance(p, _TraceVariable) for p in in_params) + assert isinstance(out_params, list) + assert all(isinstance(p, _TraceArray) for p in out_params) + + self.name = name + self.in_params = in_params + self.out_params = out_params + self.preamble = ufunc._preamble + self.routine_code = routine_code + self.compute_dtypes = compute_dtypes + + def emit_code(self): + """Returns a CUDA device function code. 
+ + Returns a string like: + ``` + __device__ void cupy_add_0(int &in0_, float &in1_, double &out0_) { + typedef double in0_type; + typedef double in1_type; + typedef double out0_type; + double in0 = (double) in0_; + double in1 = (double) in1_; + double out0 = (double) out0_; + out0 = in0 + in1; + out0_ = out0; + } + ``` + """ + nin = len(self.in_params) + dtypes = self.compute_dtypes + assert len(self.in_params) == len(self.compute_dtypes[:nin]) + in_params = [ + (get_typename(p.dtype), get_typename(t), 'in{}'.format(i)) + for i, (p, t) in enumerate(zip(self.in_params, dtypes[:nin])) + ] + out_params = [ + (get_typename(p.dtype), get_typename(t), 'out{}'.format(i)) + for i, (p, t) in enumerate(zip(self.out_params, dtypes[nin:])) + ] + params = in_params + out_params + + params_code = ', '.join(['{} &{}_'.format(t, s) for t, _, s in params]) + typedef = ['typedef {} {}_type;'.format(t, s) for _, t, s in params] + read = ['{} {} = ({}) {}_;'.format(t, s, t, s) for _, t, s in params] + write = ['{}_ = {};'.format(s, s) for _, _, s in out_params] + + return _codeblock.CodeBlock( + '__device__ void {}({})'.format(self.name, params_code), + typedef + read + [self.routine_code + ';'] + write) + + def emit_call_code(self): + params = self.in_params + self.out_params + return '{op_name}({params});'.format( + op_name=self.name, + params=', '.join([var.lvar_name for var in params])) + + +class _ElementwiseTraceOp: + """Ufunc or elementwise kernel with types. + """ + + def __init__(self, ufunc_routines, in_params, out_params, ashape): + # The `in_params` and `out_params` should be already broadcasted to + # `ashape`, but they don't guarantee to be exactly same as + # `param.ashape`. + + _fusion_thread_local.check_not_runtime() + assert isinstance(ufunc_routines, list) + assert all(isinstance(r, _UfuncRoutine) for r in ufunc_routines) + assert isinstance(ashape, tuple) + + self.ops = ufunc_routines + self.in_params = _VariableSet(*in_params) + self.out_params = _VariableSet(*out_params) + self.ashape = ashape + + @property + def params(self): + """Returns the set of all variable the loop uses. + """ + res = _VariableSet() + for op in self.ops: + res += _VariableSet(*op.in_params) + res += _VariableSet(*op.out_params) + return res + + @staticmethod + def _emit_declaration(params, in_params): + """Returns a tuple of size 2. + + 1. CUDA code: declaring local variables. + 2. The set of arrays which require indexer. + """ + _fusion_thread_local.check_not_runtime() + + indexed_arrays = _VariableSet() + code = [] + for var in params: + if var in in_params: + if isinstance(var, _TraceArray): + indexed_arrays.add(var) + f = '${type} ${lvar} = ${var}[${indexer}.get()];' + else: + f = '${type} ${lvar} = ${var};' + else: + f = '${type} ${lvar};' + code.append(var.format(f)) + + return code, indexed_arrays + + @staticmethod + def _emit_after_operation(out_params): + """Returns a tuple of size 2. + 1. CUDA code: writing the results of operations back to global memory. + 2. The set of arrays which require indexer. + """ + + _fusion_thread_local.check_not_runtime() + + indexed_arrays = _VariableSet() + codes = [] + for var in out_params: + if isinstance(var, _TraceArray): + indexed_arrays.add(var) + f = '${var}[${indexer}.get()] = ${lvar};' + else: + f = '${var} = ${lvar};' + codes.append(var.format(f)) + + return codes, indexed_arrays + + @staticmethod + def _emit_set_index(indexed_params, tid): + """Returns a CUDA code: setting a raw index to indexers. 
+ """ + _fusion_thread_local.check_not_runtime() + assert isinstance(indexed_params, _VariableSet) + + return [ + p.format('${indexer}.set(${tid});', tid=tid) + for p in indexed_params + ] + + def emit_code(self): + _fusion_thread_local.check_not_runtime() + + declaration, s1 = self._emit_declaration(self.params, self.in_params) + operation = [op.emit_call_code() for op in self.ops] + after_operation, s2 = self._emit_after_operation(self.out_params) + index_name = 'i' + indexed_array = s1 + s2 + indexer_name = next(iter(indexed_array)).indexer_name + indexer_setup = self._emit_set_index(indexed_array, index_name) + + return _codeblock.CodeBlock( + 'CUPY_FOR({}, {}.size())'.format(index_name, indexer_name), + indexer_setup + declaration + operation + after_operation) + + def emit_preamble_codes(self): + return [subm.preamble for subm in self.ops if subm.preamble != ''] + + def emit_submodule_codes(self): + return [str(subm.emit_code()) for subm in self.ops] + + +class _ReductionTraceOp: + def __init__(self, name, reduce_func, expr, in_param, out_param, axis): + """Reduction operation. + """ + _fusion_thread_local.check_not_runtime() + assert isinstance(name, str) + assert isinstance(reduce_func, _reduction._SimpleReductionKernel) + assert isinstance(in_param, _TraceArray) + assert isinstance(out_param, _TraceArray) + assert isinstance(axis, tuple) + assert all(0 <= x < in_param.ndim for x in axis) + + self.name = name + self.preamble = reduce_func.preamble + self.in_params = _VariableSet(in_param) + self.out_params = _VariableSet(out_param) + self.block_stride_name = 'block_stride_' + name + self.axis = axis + + if reduce_func.identity is None: + self.identity = '' + else: + self.identity = str(reduce_func.identity) + + _, self.expr, self.postmap_cast_code, self.reduce_ctype = expr + if self.reduce_ctype is None: + out_param, = self.out_params + self.reduce_ctype = get_typename(out_param.dtype) + + self.premap_op = None + self.postmap_op = None + + @property + def params(self): + return self.in_params + self.out_params + + def emit_code(self): + _fusion_thread_local.check_not_runtime() + assert len(self.in_params) == 1 + assert len(self.out_params) == 1 + in_param = list(self.in_params)[0] + out_param = list(self.out_params)[0] + params = ', '.join([ + in_param.var_name, + out_param.var_name, + in_param.indexer_name, + out_param.indexer_name, + ]) + return '{}({}, {});'.format( + self.name, params, self.block_stride_name) + + def emit_preamble_codes(self): + preamble = self.preamble + return [preamble] if preamble != '' else [] + + def emit_submodule_codes(self): + """Returns a CUDA device function code. + + The emitted code assumes that ``block_stride`` and `blockDim.x` is a + power of 2. 
+ """ + + in_param, = self.in_params + out_param, = self.out_params + op_name = '{}_op'.format(self.name) + postmap_name = '{}_postmap'.format(self.name) + + template = string.Template(''' +#define ${op_name}(a, b) (${reduce_expr}) +#define ${postmap_name}(a, out0) (${postmap_cast}) + +template +__device__ void ${name}( + InType in_arr, OutType out_arr, + InIndexerType in_ind, OutIndexerType out_ind, int block_stride) { + typedef ${in_type} type_in0_raw; + typedef ${out_type} type_out0_raw; + typedef ${reduce_ctype} _type_reduce; + extern __shared__ char _sdata_raw[]; + _type_reduce *sdata = reinterpret_cast<_type_reduce*>(_sdata_raw); + unsigned int tid = threadIdx.x; + int _J = tid >> __popc(block_stride - 1); + ptrdiff_t _j = (ptrdiff_t)_J * out_ind.size(); + int J_stride = blockDim.x >> __popc(block_stride - 1); + ptrdiff_t j_stride = (ptrdiff_t)J_stride * out_ind.size(); + + for (ptrdiff_t _i = (ptrdiff_t)blockIdx.x * block_stride; _i < out_ind.size(); _i += (ptrdiff_t)gridDim.x * block_stride) { + _type_reduce s = _type_reduce(${identity}); + ptrdiff_t i = _i + (tid & (block_stride - 1)); + for (ptrdiff_t j = i + _j; j < in_ind.size(); j += j_stride) { + in_ind.set(j); + s = ${op_name}(s, static_cast<_type_reduce>(in_arr[in_ind.get()])); + } + sdata[tid] = s; + __syncthreads(); + for (unsigned int block = blockDim.x / 2; block >= block_stride; block >>= 1) { + if (tid < block) { + sdata[tid] = ${op_name}(sdata[tid], sdata[tid + block]); + } + __syncthreads(); + } + if (tid < block_stride) { + s = sdata[tid]; + } + if (tid < block_stride && i < out_ind.size()) { + out_ind.set(i); + ${postmap_name}(s, out_arr[out_ind.get()]); + } + __syncthreads(); + } +}''') # NOQA + code = template.substitute( + name=self.name, + op_name=op_name, + postmap_name=postmap_name, + in_type=get_typename(in_param.dtype), + out_type=get_typename(out_param.dtype), + reduce_ctype=self.reduce_ctype, + reduce_expr=self.expr, + identity=self.identity, + postmap_cast=self.postmap_cast_code + ) + + return [code] diff --git a/cupy/_core/_fusion_optimization.py b/cupy/_core/_fusion_optimization.py new file mode 100644 index 0000000..d7f2485 --- /dev/null +++ b/cupy/_core/_fusion_optimization.py @@ -0,0 +1,90 @@ +from cupy._core import _fusion_variable +from cupy._core import _fusion_op + + +def _reduce_memory_access(ops): + required_memories = set() + + for op in ops: + for p in op.in_params + op.out_params: + if p.memory.is_inout: + required_memories.add(p.memory) + + for op in ops[::-1]: + in_memories = set([p.memory for p in op.in_params]) + + new_out_params = [] + for p in op.out_params: + if p.memory in required_memories: + new_out_params.append(p) + op.out_params = _fusion_variable._VariableSet(*new_out_params) + + # TODO(asi1024): The following improvement can be applicable only + # when the memory space is used at most once. + # `required_memories -= out_memories` + required_memories |= in_memories + + return [op for op in ops if len(op.out_params) > 0] + + +def _normalize_ashapes(ops, variables, shape_constraints): + def normalize(shape): + return tuple([shape_constraints.evaluate(d) for d in shape]) + + for var in variables: + var.ashape = normalize(var.ashape) + + for op in ops: + if isinstance(op, _fusion_op._ElementwiseTraceOp): + op.ashape = normalize(op.ashape) + + +def _fuse_two_ops(op1, op2): + """Returns a fused Op if the two ops can be fused, and ``None`` otherwise. + """ + # TODO(asi1024): Supoort reduction postmap. 
+ if not isinstance(op1, _fusion_op._ElementwiseTraceOp): + return None + + # TODO(asi1024): Support reduction premap. + if not isinstance(op2, _fusion_op._ElementwiseTraceOp): + return None + + if op1.ashape != op2.ashape: + return None + + new_in_params = op1.in_params + (op2.in_params - op1.out_params) + new_out_params = op1.out_params + op2.out_params + for in_param in new_in_params: + for out_param in new_out_params: + # Checks if two arrays may share the same memory space. + if in_param.memory == out_param.memory and in_param != out_param: + return None + + op1.ops.extend(op2.ops) + op1.in_params = new_in_params + op1.out_params = new_out_params + return op1 + + +def _fuse_consecutive_ops(ops, shape_constraints): + res = [] + for op in ops: + if len(res) == 0: + res.append(op) + else: + prev_op = res.pop(-1) + new_op = _fuse_two_ops(prev_op, op) + if new_op is None: + res.extend([prev_op, op]) + else: + res.append(new_op) + return res + + +def optimize(ops, variables, shape_constraints): + _normalize_ashapes(ops, variables, shape_constraints) + ops = _reduce_memory_access(ops) + ops = _fuse_consecutive_ops(ops, shape_constraints) + ops = _reduce_memory_access(ops) + return ops diff --git a/cupy/_core/_fusion_thread_local.pyx b/cupy/_core/_fusion_thread_local.pyx new file mode 100644 index 0000000..d3b6c0d --- /dev/null +++ b/cupy/_core/_fusion_thread_local.pyx @@ -0,0 +1,46 @@ +import threading + + +thread_local = threading.local() + + +cpdef inline bint is_old_fusing() except? -1: + try: + return thread_local.is_old_fusing + except AttributeError: + thread_local.is_old_fusing = False + return False + + +cpdef inline bint is_new_fusing() except? -1: + try: + return thread_local.is_new_fusing + except AttributeError: + thread_local.is_new_fusing = False + return False + + +cpdef inline bint is_fusing() except?
-1: + return is_old_fusing() or is_new_fusing() + + +def check_not_runtime(): + assert is_new_fusing() + + +def call_ufunc(fusion_op, *args, **kwargs): + if is_new_fusing(): + return thread_local.history.call_ufunc(fusion_op, *args, **kwargs) + import cupy + return cupy._core.fusion._call_ufunc(fusion_op, *args, **kwargs) + + +def call_reduction(fusion_op, *args, **kwargs): + if is_new_fusing(): + return thread_local.history.call_reduction(fusion_op, *args, **kwargs) + import cupy + return cupy._core.fusion._call_reduction(fusion_op, *args, **kwargs) + + +def call_indexing(fusion_op, *args, **kwargs): + return thread_local.history.call_indexing(fusion_op, *args, **kwargs) diff --git a/cupy/_core/_fusion_trace.pyx b/cupy/_core/_fusion_trace.pyx new file mode 100644 index 0000000..ff0775b --- /dev/null +++ b/cupy/_core/_fusion_trace.pyx @@ -0,0 +1,616 @@ +import numpy + +from cupy._core import _kernel +from cupy._core import _reduction +from cupy._core import core +from cupy._core._fusion_interface import _VariableProxy +from cupy._core._fusion_interface import _ArrayProxy +from cupy._core import _fusion_thread_local +from cupy._core import _fusion_variable +from cupy._core._fusion_variable import _AbstractDim +from cupy._core._fusion_variable import _TraceScalar +from cupy._core._fusion_variable import _TraceArray +from cupy._core._fusion_variable import _VariableSet +from cupy._core import _fusion_op +from cupy._core import _fusion_optimization + +from cupy._core cimport internal +from cupy._core._dtype cimport _raise_if_invalid_cast + + +_thread_local = _fusion_thread_local.thread_local +_accepted_types = (int, float, bool, complex, numpy.generic) + + +cdef class _ShapeConstraints: + """A data structure that manages the conditions between the shapes. + """ + + cdef: + # A list of tuple of _AbstractDim and _AbstractDim which represents + # the equality between dimensions. + readonly list eq_constraints + # A list of tuple of _AbstractDim and int which is an associative list + readonly list const_constraints + + def __init__(self): + self.eq_constraints = [] + self.const_constraints = [] + + def add_eq_constraint(self, x, y): + """Add a constraint: x == y. + """ + _fusion_thread_local.check_not_runtime() + assert isinstance(x, (_AbstractDim, int)) + assert isinstance(y, (_AbstractDim, int)) + x = self.evaluate(x) + y = self.evaluate(y) + if x == y: + return + if isinstance(x, _AbstractDim) and isinstance(y, _AbstractDim): + self.eq_constraints.append((x, y)) + elif isinstance(x, _AbstractDim) and not isinstance(y, _AbstractDim): + self.add_const_constraint(x, y) + elif not isinstance(x, _AbstractDim) and isinstance(y, _AbstractDim): + self.add_const_constraint(y, x) + else: + assert False + + def add_const_constraint(self, x, value): + """Add a constraint: x == value. + """ + _fusion_thread_local.check_not_runtime() + assert isinstance(x, (_AbstractDim, int)) + assert isinstance(value, int) + x = self.evaluate(x) + if isinstance(x, _AbstractDim): + self.const_constraints.append((x, value)) + else: + assert x == value + + def evaluate(self, x): + """Substitute repeatedly from the equalities. + """ + _fusion_thread_local.check_not_runtime() + assert isinstance(x, (_AbstractDim, int)) + for src, dest in self.eq_constraints + self.const_constraints: + if isinstance(x, int): + return x + if x == src: + x = dest + return x + + # Used in runtime. + def satisfy(self, dict dim_map): + """Check if the given dictionary satisfies the constraints.
+ + Args: + dim_map (dict): + A dictionary with keys of _AbstractDim type and + values of int type. + """ + for a, b in self.eq_constraints: + if dim_map[a] != dim_map[b]: + return False + for a, b in self.const_constraints: + if dim_map[a] != b: + return False + return True + + +def _guess_routine(func, args, dtype): + assert isinstance(func, (_kernel.ufunc, _reduction._SimpleReductionKernel)) + + # Feeds dummy arguments with appropriate dtypes passed to `guess_routine`. + dummy_args = [] + for x in args: + if isinstance(x, _TraceScalar): + obj = x.dtype.type(0) + else: + assert isinstance(x, _TraceArray) + obj = core.ndarray((0,), x.dtype) + dummy_args.append(obj) + + op = func._ops.guess_routine( + func.name, func._routine_cache, dummy_args, dtype, None) + return op.get_in_dtypes(), op.get_out_dtypes(), op.routine + + +def _base(array): + """Returns the base array object of given array. + """ + assert isinstance(array, core.ndarray) + return array if array.base is None else array.base + + +class _VariableCoordinator: + """Variable construction manager. + + This class calls ``_TraceArray`` or ``_TraceScalar`` internally + with unique serial numbers and returns the variable object. In + ``TraceImpl`` class, a method of ``history.vc``, which is of + ``_VariableCoordinator`` class, should be called instead of + ``_TraceArray.__init__`` or ``_TraceScalar.__init__``. + """ + + def __init__(self): + self._memory_number = 0 + self._serial_number = 0 + self._variables_dict = {} + + def _normalize_variable(self, var): + """If the input variable was already generated previously, returns it. + """ + key = var.key() + if key not in self._variables_dict: + self._variables_dict[key] = var + return self._variables_dict[key] + + def _generate_new_variable(self, var_module, dtype, **kwargs): + serial_number = self._serial_number + memory = _fusion_variable._MemorySpace( + self._memory_number, serial_number) + self._serial_number += 1 + self._memory_number += 1 + + ret = var_module(memory, serial_number, dtype, **kwargs) + memory.is_input = ret.is_input + return self._normalize_variable(ret) + + def generate_new_array(self, dtype, rshape, ashape, input_index=None): + """Generate new _TraceArray object with a new memory space. + """ + ret = self._generate_new_variable( + _TraceArray, + dtype, rshape=rshape, ashape=ashape, input_index=input_index) + ret.memory.base_ashape = ret.ashape + return ret + + def generate_new_scalar(self, dtype, **kwargs): + """Generate new _TraceScalar object with a new memory space. + """ + return self._generate_new_variable(_TraceScalar, dtype, **kwargs) + + def make_view(self, var, **kwargs): + assert isinstance(var, _TraceArray) + serial_number = self._serial_number + self._serial_number += 1 + ret = var.make_view(serial_number, **kwargs) + return self._normalize_variable(ret) + + def broadcast_to(self, var, ashape, rshape): + """Make a view of the input array with the given shape. + """ + return self.make_view( + var, ashape=ashape, rshape=rshape, broadcasted_from=var) + + def rotate_with_axis(self, var, axis): + """Make a view of an array by rotating ``var`` with given axis. + """ + assert isinstance(var, _TraceArray) + return self.make_view(var, rotated_from=var, axis=axis) + + def indexing(self, var, indices): + """Make a view of an array by indexing ``var`` with the given tuple.
+ """ + skip = var.ndim - sum([isinstance(x, (int, slice)) for x in indices]) + it = 0 + ashape = [] + rshape = [] + + if skip < 0: + raise IndexError('Too many indices for array.') + + for index in indices: + if isinstance(index, int): + it += 1 + elif isinstance(index, slice): + if not (index.start is None + and index.stop is None + and index.step in (1, -1, None)): + raise NotImplementedError( + 'Only full range ``x[::]`` or reverse ``x[::-1]`` is ' + 'supported for basic slicing in CuPy fusion.') + ashape.append(var.ashape[it]) + rshape.append(var.rshape[it]) + it += 1 + elif index is None: + ashape.append(1) + rshape.append(1) + elif index is Ellipsis: + ashape.extend(var.ashape[it:it + skip]) + rshape.extend(var.rshape[it:it + skip]) + it += skip + + ashape.extend(var.ashape[it:var.ndim]) + rshape.extend(var.rshape[it:var.ndim]) + + return self.make_view( + var, indexed_from=var, index_key=indices, + ashape=tuple(ashape), rshape=tuple(rshape)) + + @property + def all_variables(self): + """Returns the list of all variables this class emitted. + """ + return list(self._variables_dict.values()) + + +class TraceImpl: + """Emit a fused kernel from the given target function. + """ + + def __init__(self): + self.vc = _VariableCoordinator() + self.shape_constraints = _ShapeConstraints() + self.op_list = [] + + @staticmethod + def _make_interface(x): + """Returns an _array or a _scalar object which packs the given value. + """ + if x is None: + return None + assert isinstance(x, _fusion_variable._TraceVariable) + return x.as_interface() + + def _unwrap_interface(self, x, *, allow_none=False): + """Returns ``_TraceVariable`` object from the input. + """ + if allow_none and x is None: + return None + if isinstance(x, _VariableProxy): + return x.content + if isinstance(x, _accepted_types): + dtype = numpy.dtype(type(x)) + return self.vc.generate_new_scalar(dtype, const_value=x) + if isinstance(x, (numpy.ndarray, core.ndarray)): + raise TypeError('Concrete ndarray is not supported in fusion.') + raise TypeError('{} type is not supported'.format(type(x))) + + def call_ufunc(self, ufunc, *args, **kwargs): + """Register an elementwise operation with the given parameters. + + Args: + ufunc(_kernel.ufunc): The ufunc to operate. + args(tuple): The arguments. + kwargs(dict): The keyword arguments. + """ + + assert isinstance(ufunc, _kernel.ufunc) + + # Parse Inputs. + nin = ufunc.nin + nout = ufunc.nout + dtype = kwargs.pop('dtype', None) + + if 'out' in kwargs and len(args) > nin: + raise ValueError( + 'cannot specify \'out\' as both a positional and ' + 'keyword argument') + + in_params = [self._unwrap_interface(x) for x in args[:nin]] + out_params = [ + self._unwrap_interface(x, allow_none=True) + for x in args[nin:] + (kwargs.pop('out', None),) + if x is not None + ] + params = in_params + out_params + + if len(kwargs) > 0: + raise TypeError('Wrong arguments {}'.format(kwargs)) + if len(in_params) != nin or len(out_params) > nout: + raise ValueError('Invalid number of arguments') + if not all([isinstance(v, _TraceArray) for v in out_params]): + raise TypeError('Return arrays must be of ArrayType') + + # Check for inplace operation. + for i, out_param1 in enumerate(out_params): + for out_param2 in out_params[:i]: + if out_param1.memory == out_param2.memory: + # NumPy does not raise this error. + raise ValueError('Outputs of ufunc must not share memory') + + # Copy the input array data before the operation when the input array + # shares the same memory area with an output array. 
+        for i, in_param in enumerate(in_params):
+            should_copy = any([
+                in_param.memory == out_param.memory and in_param != out_param
+                for out_param in out_params
+            ])
+            if should_copy:
+                in_params[i] = self._unwrap_interface(
+                    self.call_ufunc(
+                        core.elementwise_copy,
+                        self._make_interface(in_param)))
+
+        # Broadcast shapes
+        out_rshape = internal._broadcast_shapes([p.rshape for p in params])
+        out_ashape = [None for _ in range(len(out_rshape))]
+
+        for p in params:
+            for axis in range(-p.ndim, 0):
+                if p.rshape[axis] == out_rshape[axis]:
+                    out_ashape[axis] = p.ashape[axis]
+
+        assert all([dim is not None for dim in out_ashape])
+        out_ashape = tuple(out_ashape)
+
+        # Broadcast input params and make their views.
+        for i, p in enumerate(in_params):
+            for axis in range(-p.ndim, 0):
+                if p.rshape[axis] == out_rshape[axis]:
+                    self.shape_constraints.add_eq_constraint(
+                        p.ashape[axis], out_ashape[axis])
+                elif p.rshape[axis] == 1:
+                    self.shape_constraints.add_const_constraint(
+                        p.ashape[axis], 1)
+                else:
+                    assert False
+            if isinstance(p, _TraceArray) and p.rshape != out_rshape:
+                # Broadcast the input if needed.
+                in_params[i] = self.vc.broadcast_to(p, out_ashape, out_rshape)
+
+        # Get operation code from dtypes.
+        in_dtypes, out_dtypes, expr = _guess_routine(
+            ufunc, in_params, dtype)
+
+        # Make output arrays.
+        ret = []
+        for i in range(nout):
+            if i >= len(out_params):
+                # Omitted output.
+                out_pvar = self.vc.generate_new_array(
+                    out_dtypes[i], out_rshape, out_ashape)
+                out_params.append(out_pvar)
+            elif isinstance(out_params[i], _TraceScalar):
+                raise TypeError('return arrays must be of ArrayType')
+            elif out_params[i].rshape != out_rshape:
+                raise ValueError(
+                    'non-broadcastable output operand with shape {} '
+                    'doesn\'t match the broadcast shape {}'.format(
+                        out_params[i].rshape, out_rshape))
+
+            _raise_if_invalid_cast(
+                out_dtypes[i], out_params[i].dtype, 'same_kind',
+                'output operand')
+
+            out_pvar = out_params[i]
+            ret.append(out_pvar)
+
+        # Register Op.
+        name = ufunc.name + '_' + str(len(self.op_list))
+        ufunc_routine = _fusion_op._UfuncRoutine(
+            name, ufunc, expr, in_params, out_params, in_dtypes + out_dtypes)
+        op = _fusion_op._ElementwiseTraceOp(
+            [ufunc_routine], in_params, out_params, out_ashape)
+        self.op_list.append(op)
+
+        # Returns.
+        assert len(ret) > 0
+        if len(ret) == 1:
+            return self._make_interface(ret[0])
+        else:
+            return tuple([self._make_interface(x) for x in ret])
+
+    def call_reduction(
+            self, reduce_func, a, axis=None, dtype=None, out=None,
+            keepdims=False):
+        """Register a reduction operation with the given parameters.
+
+        Args:
+            reduce_func(_reduction._SimpleReductionKernel):
+                The reduction function to operate.
+            a(array_like): The input array.
+            axis(int, tuple of int or None): The axis.
+            dtype(numpy.dtype or None): The dtype.
+            out(_array or None): The output array.
+        """
+
+        assert isinstance(reduce_func, _reduction._SimpleReductionKernel)
+
+        # Parse inputs.
+        in_param = self._unwrap_interface(a)
+
+        if not isinstance(in_param, _TraceArray):
+            raise NotImplementedError(
+                'Reduction for scalar arguments is not supported.')
+
+        axes = internal._normalize_axis_indices(axis, in_param.ndim)
+
+        if dtype is not None:
+            dtype = numpy.dtype(dtype)
+
+        if keepdims:
+            raise NotImplementedError('keepdims is not supported.')
+
+        # Determine the shape of out_param.
+        out_ashape = tuple([
+            d for axis, d in enumerate(in_param.ashape) if axis not in axes])
+        out_rshape = tuple([
+            d for axis, d in enumerate(in_param.rshape) if axis not in axes])
+
+        # Rotate axes.
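+        # A rotated view moves the reduction axes so the reduction kernel can
+        # treat them uniformly; the rotation is skipped when ``axes`` already
+        # equals ``(0, 1, ..., len(axes) - 1)``.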
+        # This condition is only for performance improvement.
+        if not all([i == axis for i, axis in enumerate(axes)]):
+            in_param = self.vc.rotate_with_axis(in_param, axes)
+
+        # Get operation code from dtypes.
+        _, (out_dtype,), expr = _guess_routine(reduce_func, [in_param], dtype)
+
+        # Make an output array.
+        if out is None:
+            # Omitted output.
+            out_param = self.vc.generate_new_array(
+                out_dtype, out_rshape, out_ashape)
+        else:
+            out_param = self._unwrap_interface(out)
+            if out_param.rshape != out_rshape:
+                raise ValueError(
+                    'Shape of specified output variable is not consistent '
+                    'with reduced shape.')
+
+        # Register Op.
+        name = 'reduce{}'.format(len(self.op_list))
+        op = _fusion_op._ReductionTraceOp(
+            name, reduce_func, expr, in_param, out_param, axes)
+        self.op_list.append(op)
+
+        # Returns.
+        return self._make_interface(out_param)
+
+    def call_indexing(self, in_param, indices):
+        """Call indexing routines.
+        """
+        in_param = self._unwrap_interface(in_param)
+
+        if not isinstance(indices, tuple):
+            indices = (indices,)
+
+        for x in indices:
+            if isinstance(x, (list, _TraceArray)):
+                # Advanced indexing.
+                raise NotImplementedError(
+                    'Advanced indexing is currently not supported.')
+
+            if not (isinstance(x, (int, slice)) or x is None or x is Ellipsis):
+                raise IndexError(
+                    'Indices must be integers, slices, ellipsis, None or '
+                    'integer or boolean arrays.')
+
+        # Basic indexing.
+        out_param = self.vc.indexing(in_param, indices)
+        return self._make_interface(out_param)
+
+    def trace(self, func, args):
+        """Call ``func`` with _TraceVariable arguments.
+
+        Returns:
+            out_params(list of _TraceVariable): The list of outputs.
+            return_size(int or str): If ``return_size`` is of int type,
+                it indicates the size of the tuple of outputs.
+                If ``'none'``, the output is ``None`` and ``out_params``
+                is empty.
+                If ``'single'``, the output is a single array and
+                ``out_params`` is a singleton list.
+
+        During the function call, ``call_ufunc``, ``call_reduction`` and
+        ``call_indexing`` are called internally.
+        """
+
+        # Register input variables.
+        in_params = []
+        array_dict = {}
+        memory_dict = {}
+        for input_index, arg in enumerate(args):
+            if arg is None:
+                var = None
+            elif isinstance(arg, core.ndarray):
+                arg_id = id(arg)
+                base_id = id(_base(arg))
+                if arg_id in array_dict:
+                    # The array is already given as an input.
+                    var = in_params[array_dict[arg_id]]
+                    assert isinstance(var, _TraceArray)
+                elif base_id in memory_dict:
+                    # This is an array which shares the same memory.
+                    base = in_params[memory_dict[base_id]]
+                    assert isinstance(base, _TraceArray)
+                    var = self.vc.make_view(base, input_index=input_index)
+                else:
+                    # Otherwise.
+                    var = self.vc.generate_new_array(
+                        arg.dtype, arg.shape, None, input_index=input_index)
+                    array_dict[arg_id] = input_index
+                    memory_dict[base_id] = input_index
+            else:
+                # Scalar input.
+                dtype = numpy.dtype(type(arg))
+                var = self.vc.generate_new_scalar(
+                    dtype, input_index=input_index)
+            in_params.append(var)
+
+        # Call the target function.
+        inputs = [self._make_interface(x) for x in in_params]
+        output = func(*inputs)
+
+        # Register output variables.
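+        # The traced function may return ``None``, a single _ArrayProxy, or
+        # a tuple of _ArrayProxy objects; ``return_size`` records which of
+        # these cases applies ('none', 'single', or the tuple length).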
+        if output is None:
+            return_size = 'none'
+            out_params = []
+        elif isinstance(output, _ArrayProxy):
+            return_size = 'single'
+            out_params = [self._unwrap_interface(output, allow_none=True)]
+        elif isinstance(output, tuple):
+            if all(isinstance(x, _ArrayProxy) for x in output):
+                return_size = len(output)
+                out_params = [
+                    self._unwrap_interface(x, allow_none=True) for x in output]
+            else:
+                raise ValueError(
+                    'All elements of the return value of a fused function '
+                    'must be of _ArrayProxy type.'
+                )
+        else:
+            raise ValueError(
+                'The return value of a fused function must be `None`, '
+                'an ndarray or a tuple of ndarrays.'
+            )
+
+        for output_index, out_param in enumerate(out_params):
+            assert isinstance(out_param, _TraceArray)
+            out_param.output_index = output_index
+            out_param.memory.is_output = True
+
+        return out_params, return_size
+
+
+def _get_ancestors_of_trace_variable(var):
+    if var is None:
+        return _VariableSet()
+    res = _VariableSet(var)
+    if isinstance(var, _TraceArray):
+        res += _get_ancestors_of_trace_variable(var._view_of)
+    return res
+
+
+class _TraceResult:
+
+    def __init__(self, op_list, params, return_size, shape_constraints):
+        self.op_list = op_list
+        self.params = params
+        self.return_size = return_size
+        self.shape_constraints = shape_constraints
+
+
+def trace(func, args):
+    history = TraceImpl()
+
+    try:
+        _thread_local.history = history
+
+        # Call `func(args)` and update `op_list`.
+        out_params, return_size = history.trace(func, args)
+    finally:
+        _thread_local.history = None
+
+    op_list = history.op_list
+    shape_constraints = history.shape_constraints
+    all_variables = history.vc.all_variables
+
+    op_list = _fusion_optimization.optimize(
+        op_list, all_variables, shape_constraints)
+
+    # Make the info passed to FusedKernel.
+    kernel_params = _VariableSet()
+    for p in out_params:
+        kernel_params += _get_ancestors_of_trace_variable(p)
+    for op in op_list:
+        for p in op.in_params + op.out_params:
+            kernel_params += _get_ancestors_of_trace_variable(p)
+    kernel_params = list(kernel_params)
+
+    # Used in mock tests.
+    history.kernel_params = kernel_params
+    history.op_list = op_list
+
+    return _TraceResult(op_list, kernel_params, return_size, shape_constraints)
diff --git a/cupy/_core/_fusion_variable.pxd b/cupy/_core/_fusion_variable.pxd
new file mode 100644
index 0000000..7515ec7
--- /dev/null
+++ b/cupy/_core/_fusion_variable.pxd
@@ -0,0 +1,5 @@
+cdef class _AbstractDim:
+
+    cdef:
+        readonly int input_index
+        readonly int axis
diff --git a/cupy/_core/_fusion_variable.pyx b/cupy/_core/_fusion_variable.pyx
new file mode 100644
index 0000000..d76911a
--- /dev/null
+++ b/cupy/_core/_fusion_variable.pyx
@@ -0,0 +1,340 @@
+import string
+
+import numpy
+
+from cupy._core import _fusion_interface
+
+from cupy._core._scalar cimport get_typename
+
+
+cdef class _AbstractDim:
+    """An abstract data structure representing the length of a dimension.
+
+    Attributes:
+        input_index (int):
+            The position of the element in the arguments passed to the
+            fused function.
+        axis (int):
+            The index of the dimension.
+    """
+
+    def __init__(self, int input_index, int axis):
+        self.input_index = input_index
+        self.axis = axis
+
+    def __hash__(self):
+        return hash((self.input_index, self.axis))
+
+    def __eq__(self, object other):
+        if isinstance(other, int):
+            return False
+        return (
+            self.input_index == other.input_index
+            and self.axis == other.axis
+        )
+
+
+class _MemorySpace:
+    """A memory space object.
+
+    Attributes:
+        id(int): The serial number of the memory space.
+        base_serial_number(int): The serial number of the base variable
+            which has this memory space.
+        is_input(bool): If this is set to ``True``, the memory space is
+            already allocated as an input array. If this is set to ``False``,
+            the memory space should be allocated before launching the kernel.
+        is_output(bool): If this is set to ``True``, the memory space is
+            used in the return values.
+    """
+    def __init__(self, memory_id, base_serial_number):
+        assert isinstance(memory_id, int)
+        assert isinstance(base_serial_number, int)
+
+        self.id = memory_id
+        self.base_serial_number = base_serial_number
+
+        # Initially, these attributes are set to `False`, but might be
+        # updated from outside.
+        self.is_input = False
+        self.is_output = False
+
+    @property
+    def is_inout(self):
+        """Returns ``True`` if the memory space is used for inputs or outputs.
+
+        If ``True``, the memory space should not be deallocated just after
+        the kernel launch. If ``False``, the memory space is used only for
+        a temporary value in the fused kernel."""
+        return self.is_input or self.is_output
+
+
+class _TraceVariable:
+    """Variable object to trace operations in the target function to be fused.
+
+    Attributes:
+        memory(_MemorySpace): The memory space the variable uses.
+        serial_number(int): The serial number of the variable object.
+        dtype(dtype): The dtype of the variable.
+        rshape(tuple of int): The real shape of the variable.
+        ashape(tuple of _AbstractDim): An abstracted shape of the variable.
+        input_index(int or None): If not `None`, this variable is used as
+            the `input_index`-th input parameter.
+        output_index(int or None): If not `None`, this variable is used as
+            the `output_index`-th output parameter.
+    """
+    def __init__(
+            self, memory_space, serial_number, dtype, rshape, ashape,
+            input_index, output_index):
+        assert isinstance(memory_space, _MemorySpace)
+        assert isinstance(serial_number, int)
+        assert isinstance(dtype, numpy.dtype)
+        assert input_index is None or isinstance(input_index, int)
+        assert output_index is None or isinstance(output_index, int)
+        assert isinstance(rshape, tuple)
+        assert isinstance(ashape, tuple)
+        assert len(rshape) == len(ashape)
+        for rdim, adim in zip(rshape, ashape):
+            assert isinstance(rdim, int)
+            assert isinstance(adim, (int, _AbstractDim))
+
+        self.memory = memory_space
+        self.serial_number = serial_number
+        self.dtype = dtype
+        self.rshape = rshape
+        self.ashape = ashape
+        self.input_index = input_index
+        self.output_index = output_index
+
+    @property
+    def ndim(self):
+        return len(self.ashape)
+
+    @property
+    def is_base(self):
+        return self.serial_number == self.memory.base_serial_number
+
+    @property
+    def is_input(self):
+        return self.input_index is not None
+
+    @property
+    def is_output(self):
+        return self.output_index is not None
+
+    @property
+    def var_name(self):
+        # The name of the variable stored in the global memory space.
+        raise NotImplementedError
+
+    @property
+    def lvar_name(self):
+        # The name of the variable stored in registers in each thread.
+        raise NotImplementedError
+
+    @property
+    def indexer_name(self):
+        """The name of the CUDA CIndexer variable for the variable.
+        """
+        # TODO(asi1024): Unify indexer with other variables which have the
+        # same shape, for performance improvements.
+        return 'ind{}_{}'.format(self.memory.id, self.serial_number)
+
+    def format(self, form, **kwargs):
+        """Returns a string following the format taken as an input.
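+
+        For example (a hypothetical template; the real ones are built by the
+        fusion code emitters), ``var.format('${type} ${lvar};')`` expands to
+        a declaration such as ``float v0_1;`` using this variable's dtype
+        and generated local name.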
+ """ + kwargs = dict([ + (k, get_typename(v) if isinstance(v, numpy.dtype) else v) + for k, v in kwargs.items()] + ) + return string.Template(form).substitute( + type=get_typename(self.dtype), + var=self.var_name, + lvar=self.lvar_name, + indexer=self.indexer_name, + **kwargs + ) + + def __hash__(self): + assert False, ( + '__hash__ is not defined. Use _VariableSet instead of ' + 'set/dict because they do not guarantee the order of contents.') + + +class _TraceScalar(_TraceVariable): + """An abstracted scalar object. + + Attributes: + const_value(scalar object or None): A compile-time constant value. + Actually, it is `None` iff self.is_input is `True`. + """ + + # TODO(asi1024): Remove index argument. + def __init__( + self, index, serial_number, dtype, input_index=None, *, + const_value=None,): + super().__init__( + index, serial_number, dtype, (), (), input_index, None) + + self.const_value = const_value + + @property + def var_name(self): + if self.const_value is None: + return 'a{}'.format(self.memory.id) + if self.dtype == '?': + return str(self.const_value).lower() + if self.dtype.kind == 'c': + return '{}({}, {})'.format( + get_typename(self.dtype), + self.const_value.real, + self.const_value.imag) + return str(self.const_value) + + @property + def lvar_name(self): + return 'v{}'.format(self.memory.id) + + def as_interface(self): + return _fusion_interface._ScalarProxy(self) + + def key(self): + return (self.memory.id,) + + +class _TraceArray(_TraceVariable): + """An abstracted array object. + + Attributes: + broadcasted_from(_TraceArray optional): TODO + rotated_from(_TraceArray optional): TODO + axis(int optional): The axis to rotate. + indexed_from(_TraceArray optional): TODO + index_key(slice): TODO + """ + + def __init__( + self, index, serial_number, dtype, input_index=None, + output_index=None, *, rshape, ashape, **kwargs): + + if ashape is None: + assert input_index is not None + ndim = len(rshape) + ashape = tuple([ + _AbstractDim(input_index, axis) for axis in range(ndim)]) + + super().__init__( + index, serial_number, dtype, rshape, ashape, + input_index, output_index) + + self._view_of = None + self.is_broadcast = False + self.rotate_axis = None + self.slice_key = None + + if 'broadcasted_from' in kwargs: + self._view_of = kwargs.pop('broadcasted_from') + self.is_broadcast = True + elif 'rotated_from' in kwargs: + self._view_of = kwargs.pop('rotated_from') + self.rotate_axis = kwargs.pop('axis') + elif 'indexed_from' in kwargs: + self._view_of = kwargs.pop('indexed_from') + self.slice_key = kwargs.pop('index_key') + + assert len(kwargs) == 0, kwargs + + @property + def var_name(self): + return 'a{}_{}'.format(self.memory.id, self.serial_number) + + @property + def lvar_name(self): + return 'v{}_{}'.format(self.memory.id, self.serial_number) + + def as_interface(self): + return _fusion_interface._ArrayProxy(self) + + def make_view(self, serial_number, **kwargs): + rshape = kwargs.pop('rshape', self.rshape) + ashape = kwargs.pop('ashape', self.ashape) + return _TraceArray( + self.memory, serial_number, self.dtype, + rshape=rshape, ashape=ashape, **kwargs) + + def key(self): + """Two variables can be identified if they have the same key. 
+ """ + if isinstance(self.slice_key, tuple): + slice_key = [] + for s in self.slice_key: + if isinstance(s, slice): + if not (s.start is None + and s.stop is None + and s.step in (None, 1, -1)): + raise NotImplementedError( + 'Basic slice supports only x[::] and x[::-1].') + slice_key.append((s.start, s.stop, s.step)) + else: + slice_key.append(s) + slice_key = tuple(slice_key) + else: + slice_key = self.slice_key + + return ( + self.memory.id, self.ashape, self.input_index, + getattr(self._view_of, 'serial_number', None), + self.is_broadcast, self.rotate_axis, slice_key, + ) + + +class _VariableSet: + """A stable set of variables + """ + + def __init__(self, *args): + self.contents = [] + for x in args: + assert isinstance(x, _TraceVariable) + if x not in self.contents: + self.contents.append(x) + + def __len__(self): + return len(self.contents) + + def item(self): + assert len(self.contents) == 1 + return self.contents[0] + + def add(self, x): + if x not in self.contents: + self.contents.append(x) + + def __iadd__(self, other): + assert isinstance(other, _VariableSet) + for x in other.contents: + self.add(x) + return self + + def __add__(self, other): + res = _VariableSet(*self.contents) + res += other + return res + + def __contains__(self, elem): + return elem in self.contents + + def __iter__(self): + return iter(self.contents) + + def __isub__(self, other): + assert isinstance(other, _VariableSet) + for x in other.contents: + if x in self.contents: + self.contents.remove(x) + return self + + def __sub__(self, other): + res = _VariableSet(*self.contents) + res -= other + return res diff --git a/cupy/_core/_gufuncs.py b/cupy/_core/_gufuncs.py new file mode 100644 index 0000000..f1adcc6 --- /dev/null +++ b/cupy/_core/_gufuncs.py @@ -0,0 +1,729 @@ +import re + +import numpy + +import cupy +import cupy._core._routines_manipulation as _manipulation +from cupy._core._dtype import get_dtype, _raise_if_invalid_cast +from cupy._core import internal + + +# Signature parsing code and dimension accessing has been borrowed +# from dask +# https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L12-L55 +_DIMENSION_NAME = r'\w+\?*' +_CORE_DIMENSION_LIST = '(?:{0:}(?:,{0:})*,?)?'.format(_DIMENSION_NAME) +_ARGUMENT = r'\({}\)'.format(_CORE_DIMENSION_LIST) +_INPUT_ARGUMENTS = '(?:{0:}(?:,{0:})*,?)?'.format(_ARGUMENT) +_OUTPUT_ARGUMENTS = '{0:}(?:,{0:})*'.format( + _ARGUMENT +) # Use `'{0:}(?:,{0:})*,?'` if gufunc- +# signature should be allowed for length 1 tuple returns +_SIGNATURE = '^{0:}->{1:}$'.format(_INPUT_ARGUMENTS, _OUTPUT_ARGUMENTS) + + +def _parse_gufunc_signature(signature): + # The code has been modifyed from dask to support optional dimensions + if not isinstance(signature, str): + raise TypeError('Signature is not a string') + + if signature == '' or signature is None: + raise ValueError('Signature cannot be empty') + + signature = signature.replace(' ', '') + if not re.match(_SIGNATURE, signature): + raise ValueError('Not a valid gufunc signature: {}'.format(signature)) + in_txt, out_txt = signature.split('->') + ins = [tuple(x.split(',')) if x != '' else () + for x in in_txt[1:-1].split('),(')] + outs = [tuple(y.split(',')) if y != '' else () + for y in out_txt[1:-1].split('),(')] + # TODO(ecastill) multiple output support + if len(outs) > 1: + raise ValueError('Currently more than 1 output is not supported') + return ins, outs + + +def _validate_normalize_axes( + axes, axis, keepdims, input_coredimss, output_coredimss +): + # This code credit 
+    # https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L58-L172
+    nin = len(input_coredimss)
+    nout = (
+        1 if not isinstance(output_coredimss, list) else len(output_coredimss)
+    )
+
+    if axes is not None and axis is not None:
+        raise ValueError(
+            'Only one of `axis` or `axes` keyword arguments should be given')
+    if axes and not isinstance(axes, list):
+        raise ValueError('`axes` has to be of type list')
+
+    # output_coredimss = output_coredimss if nout > 1 else [output_coredimss]
+    filtered_core_dims = list(filter(len, input_coredimss))
+    nr_outputs_with_coredims = len(
+        [True for x in output_coredimss if len(x) > 0])
+
+    if keepdims:
+        if nr_outputs_with_coredims > 0:
+            raise ValueError('`keepdims` can only be used for scalar outputs')
+        output_coredimss = len(output_coredimss) * [filtered_core_dims[0]]
+
+    core_dims = input_coredimss + output_coredimss
+    if axis is not None:
+        if not isinstance(axis, int):
+            raise ValueError('`axis` argument has to be an integer value')
+        if filtered_core_dims:
+            cd0 = filtered_core_dims[0]
+            if len(cd0) != 1:
+                raise ValueError(
+                    '`axis` can only be used if one core dimension is present'
+                )
+            for cd in filtered_core_dims:
+                if cd0 != cd:
+                    raise ValueError(
+                        'To use `axis`, all core dimensions have to be equal'
+                    )
+
+    # Expand defaults or axis
+    if axes is None:
+        if axis is not None:
+            axes = [(axis,) if cd else tuple() for cd in core_dims]
+        else:
+            axes = [tuple(range(-len(icd), 0)) for icd in core_dims]
+
+    axes = [(a,) if isinstance(a, int) else a for a in axes]
+
+    if (
+        (nr_outputs_with_coredims == 0)
+        and (nin != len(axes))
+        and (nin + nout != len(axes))
+    ) or ((nr_outputs_with_coredims > 0) and (nin + nout != len(axes))):
+        raise ValueError(
+            'The number of `axes` entries is not equal to the number'
+            ' of input and output arguments')
+
+    # Treat outputs
+    output_axes = axes[nin:]
+    output_axes = (
+        output_axes
+        if output_axes
+        else [tuple(range(-len(ocd), 0)) for ocd in output_coredimss]
+    )
+    input_axes = axes[:nin]
+
+    # Assert we have as many axes as output core dimensions
+    for idx, (iax, icd) in enumerate(zip(input_axes, input_coredimss)):
+        if len(iax) != len(icd):
+            raise ValueError(
+                f'The number of `axes` entries for argument #{idx}'
+                ' is not equal to the number of respective input core'
+                ' dimensions in signature')
+    if not keepdims:
+        for idx, (oax, ocd) in enumerate(zip(output_axes, output_coredimss)):
+            if len(oax) != len(ocd):
+                raise ValueError(
+                    f'The number of `axes` entries for argument #{idx}'
+                    ' is not equal to the number of respective output core'
+                    ' dimensions in signature')
+    else:
+        if input_coredimss:
+            icd0 = input_coredimss[0]
+            for icd in input_coredimss:
+                if icd0 != icd:
+                    raise ValueError(
+                        'To use `keepdims`, all core dimensions'
+                        ' have to be equal')
+            iax0 = input_axes[0]
+            output_axes = [iax0 for _ in output_coredimss]
+
+    return input_axes, output_axes
+
+
+class _OpsRegister:
+    '''
+    Holds the ops for each dtype signature, e.g. ``('ff->f', func1)``,
+    and allows looking them up.
+    '''
+    class _Op:
+        def __init__(self, in_types, out_types, func):
+            self.func = func
+            self.in_types = tuple(numpy.dtype(i) for i in in_types)
+            self.out_types = tuple(numpy.dtype(o) for o in out_types)
+            self.sig_str = (''.join(
+                in_t.char for in_t in self.in_types) + '->' + ''.join(
+                out_t.char for out_t in self.out_types))
+
+    def __init__(self, signatures, default_func, nin, nout, name):
+        self._default_func = default_func
+        self._nin = nin
+        self._nout = nout
+        self._ops = self._process_signatures(signatures)
+        self._name = name
+
+    def _sig_str_to_tuple(self, sig):
+        sig = sig.replace(' ', '')
+        toks = sig.split('->')
+        if len(toks) != 2:
+            raise ValueError(f'signature {sig} for dtypes is invalid')
+        else:
+            ins, outs = toks
+        return ins, outs
+
+    def _process_signatures(self, signatures):
+        ops = []
+        for sig in signatures:
+            if isinstance(sig, tuple):
+                sig, op = sig
+            else:
+                op = self._default_func
+            ins, outs = self._sig_str_to_tuple(sig)
+            # Check that the number of inputs and outputs matches the
+            # gufunc signature
+            if len(ins) != self._nin:
+                raise ValueError(
+                    f'signature {sig} for dtypes is invalid: the number of '
+                    'inputs is not consistent with the general signature')
+            if len(outs) != self._nout:
+                raise ValueError(
+                    f'signature {sig} for dtypes is invalid: the number of '
+                    'outputs is not consistent with the general signature')
+
+            ops.append(_OpsRegister._Op(ins, outs, op))
+        return ops
+
+    def _determine_from_args(self, args, casting):
+        n = len(args)
+        in_types = tuple(arg.dtype for arg in args)
+        for op in self._ops:
+            op_types = op.in_types
+            for i in range(n):
+                it = in_types[i]
+                ot = op_types[i]
+                if not numpy.can_cast(it, ot, casting=casting):
+                    break
+            else:
+                return op
+        return None
+
+    def _determine_from_dtype(self, dtype):
+        for op in self._ops:
+            op_types = op.out_types
+            for t in op_types:
+                if t != dtype:
+                    break
+            else:
+                return op
+        return None
+
+    def _determine_from_signature(self, signature):
+        # Let's convert the signature, as it can be a tuple of tuples
+        # or a string
+        if isinstance(signature, tuple):
+            # create a string to do a look-up on the ops
+            if len(signature) == 1:
+                raise TypeError(
+                    'The use of a length 1 tuple for the ufunc `signature` is'
+                    ' not allowed. Use `dtype` or fill the tuple with'
+                    ' `None`s.')
+            nin = self._nin
+            nout = self._nout
+            if len(signature) != (nin + nout):
+                raise TypeError(
+                    'A type-tuple must be specified of length 1 or 3 for ufunc'
+                    f' {self._name}')
+            signature = ''.join(
+                numpy.dtype(t).char for t in signature[:nin]) + '->' + ''.join(
+                numpy.dtype(t).char for t in signature[nin:nin+nout])
+
+        if isinstance(signature, str):
+            is_out = len(signature) == 1
+            for op in self._ops:
+                if is_out:
+                    for t in op.out_types:
+                        if t.char != signature:
+                            break
+                    else:
+                        return op
+                else:
+                    if op.sig_str == signature:
+                        return op
+            raise TypeError('No loop matching the specified signature and'
+                            f' casting was found for ufunc {self._name}')
+
+    def determine_dtype(self, args, dtype, casting, signature):
+        ret_dtype = None
+        func = self._default_func
+        if signature is not None:
+            # TODO(ecastill) use an externally provided signature to
+            # find the typecasting rules
+            op = self._determine_from_signature(signature)
+        elif dtype is not None:
+            if type(dtype) == tuple:
+                # TODO(ecastill) support dtype tuples
+                raise RuntimeError('dtype with tuple is not yet supported')
+            op = self._determine_from_dtype(dtype)
+        else:
+            op = self._determine_from_args(args, casting)
+
+        if op is None:
+            # Should we allow op to be none?
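+            # No registered op matched; fall back to the default function and
+            # derive the result dtype with NumPy-style type promotion.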
+            if dtype is None:
+                dtype = args[0].dtype
+                for arg in args:
+                    ret_dtype = numpy.promote_types(dtype, arg.dtype)
+            else:
+                ret_dtype = get_dtype(dtype)
+        else:
+            # Convert args to the op specified in_types
+            n_args = []
+            def argname(): return f'ufunc {self._name} input {i}'
+            for i, (arg, in_type) in enumerate(zip(args, op.in_types)):
+                _raise_if_invalid_cast(arg.dtype, in_type, casting, argname)
+
+                n_args.append(arg.astype(in_type, copy=False))
+            args = n_args
+            ret_dtype = op.out_types[0]
+            func = op.func
+
+        return args, ret_dtype, func
+
+
+class _GUFunc:
+    '''
+    Creates a Generalized Universal Function by wrapping a user
+    provided function with the signature.
+
+    ``signature`` determines if the function consumes or produces core
+    dimensions. The remaining dimensions in given input arrays (``*args``)
+    are considered loop dimensions and are required to broadcast
+    naturally against each other.
+
+    Args:
+        func (callable):
+            Function to call like ``func(*args, **kwargs)`` on input arrays
+            (``*args``) that returns an array or tuple of arrays. If
+            multiple arguments with non-matching dimensions are supplied,
+            this function is expected to vectorize (broadcast) over axes of
+            positional arguments in the style of NumPy universal functions.
+        signature (string):
+            Specifies what core dimensions are consumed and produced by
+            ``func``, according to the specification of the NumPy
+            generalized ufunc signature.
+        supports_batched (bool, optional):
+            Whether the wrapped function supports being passed the complete
+            input array, including both the loop and the core dimensions.
+            Defaults to `False`, in which case the loop dimensions are
+            iterated over in the `GUFunc` processing code.
+        supports_out (bool, optional):
+            Whether the wrapped function supports ``out`` as one of its
+            kwargs. Defaults to `False`.
+        signatures (list of tuple of str):
+            Contains strings in the form of 'ii->i' with i being the char of a
+            dtype. Each element of the list is a tuple with the string
+            and an alternative function to `func` to be executed when the
+            inputs of the function can be cast as described by this
+            signature.
+        name (str, optional):
+            Name for the GUFunc object. If not specified, ``func``'s name
+            is used.
+        doc (str, optional):
+            Docstring for the GUFunc object. If not specified,
+            ``func.__doc__`` is used.
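+
+    Example:
+        A minimal sketch (hypothetical function and values; the public
+        wrapper around this class is ``cupyx.GeneralizedUFunc``):
+
+        >>> def euclidean(x, y):
+        ...     return cupy.sqrt(((x - y) ** 2).sum(axis=-1))
+        >>> gu = _GUFunc(euclidean, '(i),(i)->()')
+        >>> gu(cupy.ones((4, 3)), cupy.zeros((4, 3))).shape
+        (4,)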
+    '''
+
+    def __init__(self, func, signature, **kwargs):
+        # We would like to create gufuncs from cupy regular ufuncs
+        # so we can avoid most of the __call__ stuff
+        self._func = func
+        self._signature = signature
+        self.__name__ = kwargs.pop('name', func.__name__)
+        self.__doc__ = kwargs.pop('doc', func.__doc__)
+
+        # The following are attributes to avoid applying certain steps
+        # when wrapping cupy functions that do some of the gufunc
+        # stuff internally due to CUDA libraries requirements
+        self._supports_batched = kwargs.pop('supports_batched', False)
+        self._supports_out = kwargs.pop('supports_out', False)
+        signatures = kwargs.pop('signatures', [])
+
+        if kwargs:
+            raise TypeError(
+                'got unexpected keyword arguments: '
+                + ', '.join([repr(k) for k in kwargs])
+            )
+
+        # Preprocess the signature here
+        input_coredimss, output_coredimss = _parse_gufunc_signature(
+            self._signature)
+        self._input_coredimss = input_coredimss
+        self._output_coredimss = output_coredimss
+        # This is pre-calculated to later check the minimum number of
+        # dimensions required per input
+        self._min_dims = [0] * len(input_coredimss)
+        for i, inp in enumerate(input_coredimss):
+            for d in inp:
+                if d[-1] != '?':
+                    self._min_dims[i] += 1
+
+        # Determine nin and nout from the parsed signature; both default
+        # to 0 when the parsed core dimensions are not a list.
+        self._nout = (
+            0
+            if not isinstance(output_coredimss, list)
+            else len(output_coredimss)
+        )
+        self._nin = (
+            0
+            if not isinstance(input_coredimss, list)
+            else len(input_coredimss)
+        )
+        # Determines the function that will be run depending on the datatypes.
+        # ``signatures`` is a list whose elements are either a dtype signature
+        # string in the format 'ii->o', or a tuple with such a string and a
+        # function other than ``func`` to be executed for those types.
+        self._ops_register = _OpsRegister(
+            signatures, self._func, self._nin, self._nout, self.__name__)
+
+    def _apply_func_to_inputs(self, func, dim, sizes, dims, args, outs):
+        # Apply function
+        # The resulting array is loop_output_dims+the specified dims
+        # Some functions have batching logic inside due to highly
+        # optimized CUDA libraries so we just call them
+        if self._supports_batched or dim == len(dims):
+            # Check if the function supports out, order and other args
+            if self._supports_out and outs is not None:
+                outs = outs[0] if len(outs) == 1 else outs
+                func(*args, out=outs)
+            else:
+                fouts = func(*args)
+                # TODO(ecastill) improve this check
+                if isinstance(fouts, cupy.ndarray):
+                    fouts = (fouts,)
+                for o, fo in zip(outs, fouts):
+                    cupy._core.elementwise_copy(fo, o)
+        else:
+            dim_size = sizes[dims[dim]][0]
+            for i in range(dim_size):
+                n_args = [a[i] for a in args]
+                if outs is not None:
+                    n_outs = [o[i] for o in outs]
+                self._apply_func_to_inputs(
+                    func, dim + 1, sizes, dims, n_args, n_outs)
+
+    def _transpose_element(self, arg, iax, shape):
+        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
+        tidc = (
+            tuple(i for i in range(
+                -len(shape) + 0, 0) if i not in iax) + iax
+        )
+        return arg.transpose(tidc)
+
+    def _get_args_transposed(self, args, input_axes, outs, output_axes):
+        # This code credit goes to Dask
+        # https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L349-L377
+        # modifications have been done to support arguments broadcast,
+        # the out argument, and optional core dims.
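+        # Each input is transposed so that its core dimensions become the
+        # trailing axes, e.g. an input with ``axes=(0,)`` for a signature
+        # core ``(i)`` gets its first axis moved to the last position.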
+        transposed_args = []
+        # This is used when reshaping the outputs so that we can delete
+        # dims that were not specified in the input
+        missing_dims = set()
+        for i, (arg, iax, input_coredims, md) in enumerate(zip(
+                args, input_axes, self._input_coredimss, self._min_dims)):
+            shape = arg.shape
+            nds = len(shape)
+            # For the inputs that have missing dimensions we need to reshape
+            if nds < md:
+                raise ValueError(f'Input operand {i} does not have enough'
+                                 f' dimensions (has {nds}, gufunc core with'
+                                 f' signature {self._signature}'
+                                 f' requires {md})')
+            optionals = len(input_coredims) - nds
+            if optionals > 0:
+                # Look for optional dimensions
+                # We only allow the first or the last dimensions to be optional
+                if input_coredims[0][-1] == '?':
+                    shape = (1,) * optionals + shape
+                    missing_dims.update(set(input_coredims[:optionals]))
+                else:
+                    shape = shape + (1,) * optionals
+                    missing_dims.update(
+                        set(input_coredims[min(0, len(shape)-1):]))
+                arg = arg.reshape(shape)
+            transposed_args.append(self._transpose_element(arg, iax, shape))
+        args = transposed_args
+
+        if outs is not None:
+            transposed_outs = []
+            # outs should be transposed to the intermediate form before
+            # copying all results
+            for out, iox, coredims in zip(
+                    outs, output_axes, self._output_coredimss):
+                transposed_outs.append(self._transpose_element(
+                    out, iox, out.shape))
+            # check that outs has been correctly transposed
+            # if the function returns a scalar, outs will be ignored
+            if len(transposed_outs) == len(outs):
+                outs = transposed_outs
+
+        # We can't directly broadcast the arrays together since their core
+        # dims might differ. Only the loop dimensions are broadcastable.
+        shape = internal._broadcast_shapes(
+            [a.shape[:-len(self._input_coredimss)] for a in args])
+        args = [_manipulation.broadcast_to(
+            a, shape + a.shape[-len(self._input_coredimss):]) for a in args]
+
+        # Assess input args for loop dims
+        input_shapes = [a.shape for a in args]
+        num_loopdims = [
+            len(s) - len(cd) for s, cd in zip(
+                input_shapes, self._input_coredimss)
+        ]
+        max_loopdims = max(num_loopdims) if num_loopdims else None
+        core_input_shapes = [
+            dict(zip(icd, s[n:]))
+            for s, n, icd in zip(
+                input_shapes, num_loopdims, self._input_coredimss)
+        ]
+        core_shapes = {}
+        for d in core_input_shapes:
+            core_shapes.update(d)
+
+        loop_input_dimss = [
+            tuple(
+                '__loopdim%d__' % d for d in range(
+                    max_loopdims - n, max_loopdims)
+            )
+            for n in num_loopdims
+        ]
+        input_dimss = [li + c for li, c in zip(
+            loop_input_dimss, self._input_coredimss)]
+
+        loop_output_dims = max(loop_input_dimss, key=len, default=())
+
+        # Collect the sizes of all dims in all arrays, then assert that dims
+        # sharing a label have a consistent size (broadcastable 1 allowed)
+        dimsizess = {}
+        for dims, shape in zip(input_dimss, input_shapes):
+            for dim, size in zip(dims, shape):
+                dimsizes = dimsizess.get(dim, [])
+                dimsizes.append(size)
+                dimsizess[dim] = dimsizes
+
+        for dim, sizes in dimsizess.items():
+            if set(sizes).union({1}) != {1, max(sizes)}:
+                raise ValueError(
+                    f'Dimension {dim} with different lengths in arrays'
+                )
+
+        return args, dimsizess, loop_output_dims, outs, missing_dims
+
+    def _determine_order(self, args, order):
+        if order.upper() in ('C', 'K'):
+            # Order is determined to be C to allocate the out array
+            # but we will change the strides of the out array
+            # to be K later in __call__
+            return 'C'
+        elif order.upper() == 'A':
+            # order is F if all arrays are strictly F
+            order = ('F' if all([a.flags.f_contiguous
+                                 and not a.flags.c_contiguous
+                                 for a in args]) else 'C')
+            return order
+
+        elif order.upper() == 'F':
+            return 'F'
+        else:
+            raise RuntimeError(f'Unknown order {order}')
+
+    def __call__(self, *args, **kwargs):
+        '''
+        Apply a generalized ufunc.
+
+        Args:
+            args: Input arguments. Each of them can be a :class:`cupy.ndarray`
+                object or a scalar. The output arguments can be omitted or be
+                specified by the ``out`` argument.
+            axes (list of tuples of int, optional):
+                A list of tuples with indices of axes a generalized ufunc
+                should operate on.
+                For instance, for a signature of ``'(i,j),(j,k)->(i,k)'``
+                appropriate for matrix multiplication, the base elements are
+                two-dimensional matrices and these are taken to be stored in
+                the two last axes of each argument. The corresponding
+                axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
+                For simplicity, for generalized ufuncs that operate on
+                1-dimensional arrays (vectors), a single integer is accepted
+                instead of a single-element tuple, and for generalized ufuncs
+                for which all outputs are scalars, the output tuples
+                can be omitted.
+            axis (int, optional):
+                A single axis over which a generalized ufunc should operate.
+                This is a short-cut for ufuncs that operate over a single,
+                shared core dimension, equivalent to passing in axes with
+                entries of (axis,) for each single-core-dimension argument
+                and ``()`` for all others.
+                For instance, for a signature ``'(i),(i)->()'``, it is
+                equivalent to passing in ``axes=[(axis,), (axis,), ()]``.
+            keepdims (bool, optional):
+                If this is set to True, axes which are reduced over will be
+                left in the result as a dimension with size one, so that the
+                result will broadcast correctly against the inputs. This
+                option can only be used for generalized ufuncs that operate
+                on inputs that all have the same number of core dimensions
+                and with outputs that have no core dimensions, i.e., with
+                signatures like ``'(i),(i)->()'`` or ``'(m,m)->()'``.
+                If used, the location of the dimensions in the output can
+                be controlled with axes and axis.
+            casting (str, optional):
+                Provides a policy for what kind of casting is permitted.
+                Defaults to ``'same_kind'``.
+            dtype (dtype, optional):
+                Overrides the dtype of the calculation and output arrays.
+                Similar to signature.
+            signature (str or tuple of dtype, optional):
+                Either a data-type, a tuple of data-types, or a special
+                signature string indicating the input and output types of a
+                ufunc. This argument allows you to provide a specific
+                signature for the function to be used if registered in the
+                ``signatures`` kwarg of the ``__init__`` method.
+                If the loop specified does not exist for the ufunc, then
+                a TypeError is raised. Normally, a suitable loop is found
+                automatically by comparing the input types with what is
+                available and searching for a loop with data-types to
+                which all inputs can be cast safely. This keyword argument
+                lets you bypass that search and choose a particular loop.
+            order (str, optional):
+                Specifies the memory layout of the output array. Defaults to
+                ``'K'``. ``'C'`` means the output should be C-contiguous,
+                ``'F'`` means F-contiguous, ``'A'`` means F-contiguous
+                if the inputs are F-contiguous and also not C-contiguous,
+                C-contiguous otherwise, and ``'K'`` means to match the element
+                ordering of the inputs as closely as possible.
+            out (cupy.ndarray): Output array. By default, new arrays are
+                created for the outputs.
+
+        Returns:
+            Output array or a tuple of output arrays.
+        '''
+
+        # This argument cannot be used for generalized ufuncs
+        # as those take non-scalar input.
+        # where = kwargs.pop('where', None)
+
+        outs = kwargs.pop('out', None)
+        axes = kwargs.pop('axes', None)
+        axis = kwargs.pop('axis', None)
+        order = kwargs.pop('order', 'K')
+        dtype = kwargs.pop('dtype', None)
+        keepdims = kwargs.pop('keepdims', False)
+        signature = kwargs.pop('signature', None)
+        casting = kwargs.pop('casting', 'same_kind')
+        if len(kwargs) > 0:
+            raise RuntimeError(
+                'Unknown kwargs {}'.format(' '.join(kwargs.keys())))
+
+        ret_dtype = None
+        func = self._func
+
+        # this will cast the inputs appropriately
+        args, ret_dtype, func = self._ops_register.determine_dtype(
+            args, dtype, casting, signature)
+
+        if not type(self._signature) == str:
+            raise TypeError('`signature` has to be of type string')
+
+        if outs is not None and type(outs) != tuple:
+            if isinstance(outs, cupy.ndarray):
+                outs = (outs,)
+            else:
+                raise TypeError('`outs` must be a tuple or `cupy.ndarray`')
+
+        filter_order = self._determine_order(args, order)
+
+        input_coredimss = self._input_coredimss
+        output_coredimss = self._output_coredimss
+        if outs is not None and type(outs) != tuple:
+            raise TypeError('`outs` must be a tuple')
+        # Axes
+        input_axes, output_axes = _validate_normalize_axes(
+            axes, axis, keepdims, input_coredimss, output_coredimss
+        )
+
+        if len(input_coredimss) != len(args):
+            raise ValueError(
+                'According to `signature`, `func` requires %d arguments,'
+                ' but %d given' % (len(input_coredimss), len(args)))
+
+        args, dimsizess, loop_output_dims, outs, m_dims = self._get_args_transposed(  # NOQA
+            args, input_axes, outs, output_axes)
+
+        # The output shape varies depending on optional dims or not
+        # TODO(ecastill) this only works for one out argument
+        out_shape = [dimsizess[od][0] for od in loop_output_dims]
+        if self._nout > 0:
+            out_shape += [dimsizess[od][0] for od in output_coredimss[0]]
+        out_shape = tuple(out_shape)
+
+        if outs is None:
+            outs = cupy.empty(out_shape, dtype=ret_dtype, order=filter_order)
+            if order == 'K':
+                strides = internal._get_strides_for_order_K(
+                    outs, ret_dtype, out_shape)
+                outs._set_shape_and_strides(out_shape, strides, True, True)
+            outs = (outs,)
+        else:
+            if outs[0].shape != out_shape:
+                raise ValueError(f'Invalid shape for out {outs[0].shape}'
+                                 f' needs {out_shape}')
+
+            _raise_if_invalid_cast(
+                ret_dtype, outs[0].dtype, casting, "out dtype")
+
+        self._apply_func_to_inputs(
+            func, 0, dimsizess, loop_output_dims, args, outs)
+
+        # This code credit goes to Dask
+        # https://github.com/dask/dask/blob/61b578f5a3ad88cbc6a8b9a73ce08c551bd969fa/dask/array/gufunc.py#L462-L503
+        # Treat direct output
+
+        if self._nout == 0:
+            output_coredimss = [output_coredimss]
+
+        # Split output
+        # tmp might be a tuple of outs
+        # we changed the way we apply the function compared to dask
+        # we have added support for optional dims
+        leaf_arrs = []
+        for tmp in outs:
+            for i, (ocd, oax) in enumerate(zip(output_coredimss, output_axes)):
+                leaf_arr = tmp
+
+                # Axes:
+                if keepdims:
+                    slices = (len(leaf_arr.shape) * (slice(None),)
+                              + len(oax) * (numpy.newaxis,))
+                    leaf_arr = leaf_arr[slices]
+
+                tidcs = [None] * len(leaf_arr.shape)
+                for i, oa in zip(range(-len(oax), 0), oax):
+                    tidcs[oa] = i
+                j = 0
+                for i in range(len(tidcs)):
+                    if tidcs[i] is None:
+                        tidcs[i] = j
+                        j += 1
+                leaf_arr = leaf_arr.transpose(tidcs)
+                # Delete the dims that were optional after the input expansion
+                if len(m_dims) > 0:
+                    shape = leaf_arr.shape
+                    # This line deletes the dimensions that were not present
+                    # in the input
+                    core_shape = shape[-len(ocd):]
+                    core_shape = tuple([
+                        d for d, n in zip(core_shape, ocd) if n not in m_dims])
+                    shape = shape[:-len(ocd)] + core_shape
+                    leaf_arr = leaf_arr.reshape(shape)
+                # leaf_arrs.append(leaf_arr.astype(leaf_arr.dtype, order=order))  # NOQA
+                leaf_arrs.append(leaf_arr)
+        return tuple(leaf_arrs) if self._nout > 1 else leaf_arrs[0]
diff --git a/cupy/_core/_kernel.pxd b/cupy/_core/_kernel.pxd
new file mode 100644
index 0000000..e4b139e
--- /dev/null
+++ b/cupy/_core/_kernel.pxd
@@ -0,0 +1,170 @@
+from libcpp cimport vector
+
+from cupy._core cimport _carray
+from cupy._core cimport _scalar
+from cupy._core._carray cimport shape_t
+from cupy._core.core cimport _ndarray_base
+from cupy.cuda cimport memory
+from cupy.cuda cimport texture
+
+
+cdef class ParameterInfo:
+    cdef:
+        readonly str name
+        readonly object dtype
+        readonly str ctype
+        readonly bint raw
+        readonly bint is_const
+
+
+cdef enum _ArgKind:
+    ARG_KIND_NDARRAY = 1
+    ARG_KIND_INDEXER
+    ARG_KIND_SCALAR
+    ARG_KIND_POINTER
+    ARG_KIND_TEXTURE
+
+
+cdef class _ArgInfo:
+    # Holds metadata of an argument.
+    # This class is immutable and used as a part of hash keys.
+
+    cdef:
+        readonly _ArgKind arg_kind
+        readonly type type
+        readonly object dtype
+        readonly int ndim
+        readonly bint c_contiguous
+        readonly bint index_32_bits
+
+    cdef _ArgInfo _init(
+        self,
+        _ArgKind arg_kind,
+        type typ,
+        object dtype,
+        int ndim,
+        bint c_contiguous,
+        bint index_32_bits)
+
+    @staticmethod
+    cdef _ArgInfo from_arg(object arg)
+
+    @staticmethod
+    cdef _ArgInfo from_ndarray(_ndarray_base arg)
+
+    @staticmethod
+    cdef _ArgInfo from_scalar(_scalar.CScalar arg)
+
+    @staticmethod
+    cdef _ArgInfo from_indexer(_carray.Indexer arg)
+
+    @staticmethod
+    cdef _ArgInfo from_memptr(memory.MemoryPointer arg)
+
+    @staticmethod
+    cdef _ArgInfo from_texture(texture.TextureObject arg)
+
+    cdef _ArgInfo as_ndarray_with_ndim(self, int ndim)
+
+    cdef bint is_ndarray(self)
+
+    cdef bint is_scalar(self)
+
+    cdef str get_c_type(self)
+
+    cdef str get_param_c_type(self, ParameterInfo p)
+
+    cdef str get_c_var_name(self, ParameterInfo p)
+
+
+cdef class _TypeMap:
+    # Typedef mapping between C types.
+    # This class is immutable.
+
+    cdef:
+        tuple _pairs
+
+    cdef str get_typedef_code(self)
+
+
+cdef class _Op:
+    """Simple data structure that represents a kernel routine with single \
+concrete dtype mapping.
+    """
+
+    cdef:
+        readonly tuple in_types
+        readonly tuple out_types
+        readonly int nin
+        readonly int nout
+        readonly object routine
+        # If the type combination specified by in_types and out_types is
+        # disallowed, error_func must be set instead of routine.
+        # It's called by check_valid() method.
+        readonly object error_func
+
+    @staticmethod
+    cdef _Op _from_type_and_routine_or_error_func(
+        str typ, object routine, object error_func)
+
+    # Creates an op instance parsing a dtype mapping.
+    @staticmethod
+    cdef _Op from_type_and_routine(str typ, routine)
+
+    cpdef tuple get_in_dtypes(self)
+
+    cpdef tuple get_out_dtypes(self)
+
+    # Creates an op instance parsing a dtype mapping with given error function.
+    @staticmethod
+    cdef _Op from_type_and_error_func(str typ, error_func)
+
+    # Raises an error if error_func is given.
+    cdef check_valid(self)
+
+
+cdef class _Ops:
+    """A kernel routine representation with various dtype mappings.
+    """
+
+    cdef:
+        readonly tuple ops
+        readonly int nin
+        readonly int nout
+
+    @staticmethod
+    cdef _Ops from_tuples(object ops, routine)
+
+    # Queries a single op from input arguments.
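+    # ``cache`` maps the input dtypes (and the requested ``dtype``) to the
+    # resolved op so repeated calls with the same types skip the lookup.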
+    cpdef _Op guess_routine(
+        self, str name, dict cache, list in_args, dtype, _Ops out_ops)
+
+    cpdef _Op _guess_routine_from_in_types(
+        self, tuple in_types, object can_cast=*)
+
+    cpdef _Op _guess_routine_from_dtype(self, object dtype)
+
+
+cpdef create_ufunc(name, ops, routine=*, preamble=*, doc=*,
+                   default_casting=*, loop_prep=*, out_ops=*,
+                   cutensor_op=*, scatter_op=*)
+
+cdef tuple _get_arginfos(list args)
+
+cdef str _get_kernel_params(tuple params, tuple arginfos)
+
+cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape)
+
+cdef list _get_out_args_from_optionals(
+    subtype, list out_args, tuple out_types, const shape_t& out_shape, casting,
+    obj)
+
+cdef list _get_out_args_with_params(
+    list out_args, tuple out_types,
+    const shape_t& out_shape, tuple out_params, bint is_size_specified)
+
+cdef _check_peer_access(_ndarray_base arr, int device_id)
+
+cdef list _preprocess_args(int dev_id, args, bint use_c_scalar)
+
+cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape)
diff --git a/cupy/_core/_kernel.pyx b/cupy/_core/_kernel.pyx
new file mode 100644
index 0000000..1e719f6
--- /dev/null
+++ b/cupy/_core/_kernel.pyx
@@ -0,0 +1,1622 @@
+import string
+import warnings
+
+import numpy
+
+import cupy
+from cupy.cuda import compiler
+from cupy import _util
+
+cimport cython  # NOQA
+
+from libcpp cimport vector
+
+from cupy.cuda cimport device
+from cupy.cuda cimport function
+from cupy.cuda cimport memory
+from cupy.cuda cimport texture
+from cupy._core cimport _accelerator
+from cupy._core cimport _carray
+from cupy._core cimport _scalar
+from cupy._core._dtype cimport get_dtype, _raise_if_invalid_cast
+from cupy._core._memory_range cimport may_share_bounds
+from cupy._core._scalar import get_typename as _get_typename
+from cupy._core cimport core
+from cupy._core.core cimport _convert_object_with_cuda_array_interface
+from cupy._core.core cimport _ndarray_init
+from cupy._core.core cimport compile_with_cache
+from cupy._core.core cimport _ndarray_base
+from cupy._core cimport internal
+from cupy_backends.cuda.api cimport runtime
+
+try:
+    import cupy_backends.cuda.libs.cutensor as cuda_cutensor
+except ImportError:
+    cuda_cutensor = None
+
+from cupy._core import _fusion_thread_local
+
+
+cdef inline bint _contains_zero(const shape_t& v) except? -1:
+    for i in range(v.size()):
+        if v[i] == 0:
+            return True
+    return False
+
+
+@_util.memoize(for_each_device=True)
+def _get_warpsize():
+    device_id = runtime.getDevice()
+    return runtime.getDeviceProperties(device_id)['warpSize']
+
+
+cdef str _get_simple_elementwise_kernel_code(
+        tuple params, tuple arginfos, str operation, str name,
+        _TypeMap type_map, str preamble, str loop_prep='', str after_loop=''):
+    # Loop unrolling is disabled (`#pragma unroll 1`) to avoid 64-bit
+    # division in the generated code.
+    module_code = string.Template('''
+    ${typedef_preamble}
+    ${preamble}
+    extern "C" __global__ void ${name}(${params}) {
+      ${loop_prep};
+      #pragma unroll 1
+      CUPY_FOR(i, _ind.size()) {
+        _ind.set(i);
+        ${operation};
+      }
+      ${after_loop};
+    }
+    ''').substitute(
+        typedef_preamble=type_map.get_typedef_code(),
+        params=_get_kernel_params(params, arginfos),
+        operation=operation,
+        name=name,
+        preamble=preamble,
+        loop_prep=loop_prep,
+        after_loop=after_loop)
+    return module_code
+
+
+cdef function.Function _get_simple_elementwise_kernel_from_code(
+        str name, str code, tuple options=()):
+    module = compile_with_cache(code, options)
+    return module.get_function(name)
+
+
+cdef function.Function _get_simple_elementwise_kernel(
+        tuple params, tuple arginfos, str operation, str name,
+        _TypeMap type_map, str preamble, str loop_prep='', str after_loop='',
+        tuple options=()):
+    code = _get_simple_elementwise_kernel_code(
+        params, arginfos, operation, name, type_map, preamble, loop_prep,
+        after_loop
+    )
+    return _get_simple_elementwise_kernel_from_code(name, code, options)
+
+
+cdef inline int _get_kind_score(int kind):
+    if b'b' == kind:
+        return 0
+    if b'u' == kind or b'i' == kind:
+        return 1
+    if b'f' == kind or b'c' == kind:
+        return 2
+    return -1
+
+
+@cython.profile(False)
+cdef inline _check_peer_access(_ndarray_base arr, int device_id):
+    if arr.data.device_id == device_id:
+        return
+
+    msg = (
+        f'The device where the array resides ({arr.data.device_id}) is '
+        f'different from the current device ({device_id}).'
+    )
+
+    cdef bint peer_access = device._enable_peer_access(
+        device_id, arr.data.device_id)
+    if not peer_access:
+        raise ValueError(
+            f'{msg} Peer access is unavailable between these devices.')
+    warnings.warn(
+        f'{msg} Peer access has been activated automatically.',
+        _util.PerformanceWarning)
+
+
+cdef inline _preprocess_arg(int dev_id, arg, bint use_c_scalar):
+    if isinstance(arg, _ndarray_base):
+        s = arg
+        _check_peer_access(<_ndarray_base>s, dev_id)
+    elif isinstance(arg, texture.TextureObject):
+        s = arg
+    elif hasattr(arg, '__cuda_array_interface__'):
+        s = _convert_object_with_cuda_array_interface(arg)
+        _check_peer_access(<_ndarray_base>s, dev_id)
+    elif hasattr(arg, '__cupy_get_ndarray__'):
+        s = arg.__cupy_get_ndarray__()
+        _check_peer_access(<_ndarray_base>s, dev_id)
+    else:  # scalars or invalid args
+        if use_c_scalar:
+            s = _scalar.scalar_to_c_scalar(arg)
+        else:
+            s = _scalar.scalar_to_numpy_scalar(arg)
+        if s is None:
+            raise TypeError('Unsupported type %s' % type(arg))
+    return s
+
+
+cdef list _preprocess_args(int dev_id, args, bint use_c_scalar):
+    """Preprocesses arguments for kernel invocation
+
+    - Checks device compatibility for ndarrays
+    - Converts Python/NumPy scalars:
+      - If use_c_scalar is True, into CScalars.
+      - If use_c_scalar is False, into NumPy scalars.
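+
+    Returns the list of preprocessed arguments in their original order.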
+ """ + cdef list ret = [] + for arg in args: + ret.append(_preprocess_arg(dev_id, arg, use_c_scalar)) + return ret + + +cdef list _preprocess_optional_args(int dev_id, args, bint use_c_scalar): + """Preprocesses arguments for kernel invocation + + - Checks device compatibility for ndarrays + - Converts Python/NumPy scalars: + - If use_c_scalar is True, into CScalars. + - If use_c_scalar is False, into NumPy scalars. + """ + cdef list ret = [] + for arg in args: + if arg is None: + ret.append(None) + else: + ret.append(_preprocess_arg(dev_id, arg, use_c_scalar)) + return ret + + +cdef class _ArgInfo: + # Holds metadata of an argument. + # This class is immutable and used as a part of hash keys. + + def __init__(self, *args): + arg_kind, typ, dtype, ndim, c_contiguous, index_32_bits = args + self._init(arg_kind, typ, dtype, ndim, c_contiguous, index_32_bits) + + cdef _ArgInfo _init( + self, + _ArgKind arg_kind, + type typ, + object dtype, + int ndim, + bint c_contiguous, + bint index_32_bits): + self.arg_kind = arg_kind + self.type = typ + self.dtype = dtype + self.ndim = ndim + self.c_contiguous = c_contiguous + self.index_32_bits = index_32_bits + + @staticmethod + cdef _ArgInfo from_arg(object arg): + typ = type(arg) + if issubclass(typ, _ndarray_base): + return _ArgInfo.from_ndarray(arg) + if typ is _scalar.CScalar: + return _ArgInfo.from_scalar(arg) + if typ is _carray.Indexer: + return _ArgInfo.from_indexer(arg) + if typ is memory.MemoryPointer: + return _ArgInfo.from_memptr(arg) + if typ is texture.TextureObject: + return _ArgInfo.from_texture(arg) + assert False, typ + + @staticmethod + cdef _ArgInfo from_ndarray(_ndarray_base arg): + cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo) + ret._init( + ARG_KIND_NDARRAY, + type(arg), + arg.dtype.type, + arg._shape.size(), + arg._c_contiguous, + arg._index_32_bits) + return ret + + @staticmethod + cdef _ArgInfo from_scalar(_scalar.CScalar arg): + cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo) + dtype = arg.get_numpy_type() + ret._init(ARG_KIND_SCALAR, _scalar.CScalar, dtype, 0, True, True) + return ret + + @staticmethod + cdef _ArgInfo from_indexer(_carray.Indexer arg): + cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo) + ret._init( + ARG_KIND_INDEXER, _carray.Indexer, None, arg.ndim, True, + arg._index_32_bits) + return ret + + @staticmethod + cdef _ArgInfo from_memptr(memory.MemoryPointer arg): + cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo) + ret._init( + ARG_KIND_POINTER, memory.MemoryPointer, None, 0, True, True) + return ret + + @staticmethod + cdef _ArgInfo from_texture(texture.TextureObject arg): + cdef _ArgInfo ret = _ArgInfo.__new__(_ArgInfo) + ret._init( + ARG_KIND_TEXTURE, texture.TextureObject, None, 0, True, True) + return ret + + def __hash__(self): + return hash((self.arg_kind, self.type, self.dtype, self.ndim, + self.c_contiguous, self.index_32_bits)) + + def __eq__(self, other): + cdef _ArgInfo oth + if not isinstance(other, _ArgInfo): + return False + oth = other + return ( + self.arg_kind == oth.arg_kind + and self.type is oth.type + and self.dtype == oth.dtype + and self.ndim == oth.ndim + and self.c_contiguous == oth.c_contiguous + and self.index_32_bits == oth.index_32_bits) + + def __repr__(self): + return '<_ArgInfo({})>'.format( + ' '.join([ + 'arg_kind={!r}'.format(self.arg_kind), + 'type={!r}'.format(self.type), + 'dtype={!r}'.format(self.dtype), + 'ndim={!r}'.format(self.ndim), + 'c_contiguous={!r}'.format(self.c_contiguous), + 'index_32_bits={!r}'.format(self.index_32_bits), + ])) + + cdef _ArgInfo 
+        # Returns an ndarray _ArgInfo with altered ndim.
+        # If ndim is the same, self is returned untouched.
+        assert self.arg_kind == ARG_KIND_NDARRAY
+        if self.ndim == ndim:
+            return self
+        return _ArgInfo(
+            ARG_KIND_NDARRAY, self.type, self.dtype, ndim, False, False)
+
+    cdef bint is_ndarray(self):
+        return self.arg_kind == ARG_KIND_NDARRAY
+
+    cdef bint is_scalar(self):
+        return self.arg_kind == ARG_KIND_SCALAR
+
+    cdef str get_c_type(self):
+        # Returns the C type representation.
+        if self.arg_kind == ARG_KIND_NDARRAY:
+            return 'CArray<%s, %d, %d, %d>' % (
+                _get_typename(self.dtype), self.ndim,
+                self.c_contiguous, self.index_32_bits)
+        if self.arg_kind == ARG_KIND_SCALAR:
+            return _get_typename(self.dtype)
+        if self.arg_kind == ARG_KIND_INDEXER:
+            return 'CIndexer<%d, %d>' % (self.ndim, self.index_32_bits)
+        if self.arg_kind == ARG_KIND_TEXTURE:
+            return 'cudaTextureObject_t'
+        assert False
+
+    cdef str get_param_c_type(self, ParameterInfo p):
+        # Returns the C type representation in the global function's
+        # parameter list.
+        cdef str ctyp = self.get_c_type()
+        if p.is_const:
+            return 'const ' + ctyp
+        return ctyp
+
+    cdef str get_c_var_name(self, ParameterInfo p):
+        if self.arg_kind in (ARG_KIND_NDARRAY, ARG_KIND_POINTER) and not p.raw:
+            return '_raw_' + p.name
+        return p.name
+
+
+cdef tuple _get_arginfos(list args):
+    return tuple([_ArgInfo.from_arg(a) for a in args])
+
+
+cdef str _get_kernel_params(tuple params, tuple arginfos):
+    cdef ParameterInfo p
+    cdef _ArgInfo arginfo
+    assert len(params) == len(arginfos)
+    lst = []
+    for i in range(len(params)):
+        p = params[i]
+        arginfo = arginfos[i]
+        lst.append('{} {}'.format(
+            arginfo.get_param_c_type(p),
+            arginfo.get_c_var_name(p)))
+    return ', '.join(lst)
+
+
+cdef shape_t _reduce_dims(list args, tuple params, const shape_t& shape):
+    """Merges contiguous dimensions to optimize the CUDA kernel."""
+    cdef _ndarray_base arr
+
+    if shape.size() <= 1 or len(args) == 0:
+        return shape
+
+    if len(args) == 1:  # fast path for reduction
+        a = args[0]
+        if (<ParameterInfo>params[0]).raw or not isinstance(a, _ndarray_base):
+            return shape
+        arr = a
+        arr = arr.reduced_view()
+        if arr is a:
+            return shape
+        else:
+            args[0] = arr
+            return arr._shape
+    return _reduced_view_core(args, params, shape)
+
+
+cdef shape_t _reduced_view_core(list args, tuple params, const shape_t& shape):
+    cdef int i, ax, last_ax, ndim
+    cdef Py_ssize_t total_size
+    cdef shape_t vecshape, newshape, newstrides
+    cdef vector.vector[int] array_indexes, axes
+    cdef vector.vector[int] strides_indexes
+    cdef ParameterInfo p
+    cdef _ndarray_base arr
+
+    ndim = shape.size()
+    array_indexes.reserve(len(args))
+    strides_indexes.reserve(len(args))
+    for i in range(len(args)):
+        p = params[i]
+        if p.raw:
+            continue
+        a = args[i]
+        if isinstance(a, _ndarray_base):
+            array_indexes.push_back(i)
+            arr = a
+            if not arr._c_contiguous:
+                if ndim == 2:  # short cut
+                    return shape
+                strides_indexes.push_back(i)
+
+    if array_indexes.size() == 0:
+        return shape
+
+    if strides_indexes.size() == 0:
+        # The input arrays are all c_contiguous
+        i = array_indexes[0]
+        arr = args[i]
+        total_size = arr.size
+        newshape.assign(1, total_size)
+        newstrides.resize(1)
+        for i in array_indexes:
+            arr = args[i]
+            newstrides[0] = arr.dtype.itemsize
+            # TODO(niboshi): Confirm update_x_contiguity flags
+            args[i] = arr._view(
+                type(arr), newshape, newstrides, False, True, arr)
+        return newshape
+
+    axes.reserve(ndim)
+    vecshape.reserve(ndim)
+    for ax in range(ndim):
vecshape.push_back(shape[ax]) + last_ax = -1 + for ax in range(ndim): + if vecshape[ax] == 1: + continue + if last_ax < 0: + last_ax = ax + continue + for i in strides_indexes: + arr = args[i] + if arr._strides[ax] * vecshape[ax] != arr._strides[last_ax]: + axes.push_back(last_ax) + break + else: + vecshape[ax] *= vecshape[last_ax] + last_ax = ax + if last_ax >= 0: + axes.push_back(last_ax) + if axes.size() == ndim: + return shape + + newshape.reserve(axes.size()) + newstrides.reserve(axes.size()) + for ax in axes: + newshape.push_back(vecshape[ax]) + for i in array_indexes: + arr = args[i] + newstrides.clear() + for ax in axes: + newstrides.push_back(arr._strides[ax]) + # TODO(niboshi): Confirm update_x_contiguity flags + args[i] = arr._view(type(arr), newshape, newstrides, False, True, arr) + return newshape + + +cdef class ParameterInfo: + + def __init__(self, str param, bint is_const): + self.name = None + self.dtype = None + self.ctype = None + self.raw = False + self.is_const = is_const + s = tuple([i for i in param.split() if len(i) != 0]) + if len(s) < 2: + raise Exception('Syntax error: %s' % param) + + t, self.name = s[-2:] + if t == 'CIndexer': + pass + elif len(t) == 1: + self.ctype = t + else: + dtype = get_dtype(t) + self.dtype = dtype.type + if dtype.name != t: + raise ValueError('Wrong type %s' % t) + self.ctype = _get_typename(self.dtype) + + for i in s[:-2]: + if i == 'raw': + self.raw = True + elif i == '_non_const': + self.is_const = False + else: + raise Exception('Unknown keyword "%s"' % i) + + def __hash__(self): + return hash(( + self.name, self.dtype, self.ctype, self.raw, self.is_const)) + + def __eq__(self, other): + cdef ParameterInfo oth + if not isinstance(other, ParameterInfo): + return False + oth = other + return ( + self.name == oth.name + and self.dtype == oth.dtype + and self.ctype == oth.ctype + and self.raw == oth.raw + and self.is_const == oth.is_const) + + def __repr__(self): + return '<ParameterInfo({})>'.format( + ' '.join([ + 'name={!r}'.format(self.name), + 'dtype={!r}'.format(self.dtype), + 'ctype={!r}'.format(self.ctype), + 'raw={!r}'.format(self.raw), + 'is_const={!r}'.format(self.is_const), + ])) + + +@_util.memoize() +def _get_param_info(str s, is_const): + if len(s) == 0: + return () + return tuple([ParameterInfo(i, is_const) for i in s.strip().split(',')]) + + +@_util.memoize() +def _decide_params_type(in_params, out_params, in_args_dtype, out_args_dtype): + return _decide_params_type_core(in_params, out_params, in_args_dtype, + out_args_dtype) + + +cdef class _TypeMap: + + def __init__(self, pairs): + self._pairs = pairs + + def __hash__(self): + return hash(self._pairs) + + def __eq__(self, other): + if not isinstance(other, _TypeMap): + return False + return self._pairs == (<_TypeMap>other)._pairs + + def __str__(self): + return '<_TypeMap {}>'.format(self._pairs) + + cdef str get_typedef_code(self): + # Returns a code fragment of typedef statements used as preamble. + return ''.join([ + 'typedef %s %s;\n' % (_get_typename(ctype2), ctype1) + for ctype1, ctype2 in self._pairs]) + + +cdef tuple _decide_params_type_core( + tuple in_params, tuple out_params, tuple in_args_dtype, + tuple out_args_dtype): + type_dict = {} + if out_args_dtype: + assert len(out_params) == len(out_args_dtype) + for p, a in zip(out_params, out_args_dtype): + if a is None: + raise TypeError('Output arguments must be cupy.ndarray') + if p.dtype is not None: + if get_dtype(a) != get_dtype(p.dtype): + raise TypeError( + 'Type is mismatched. 
%s %s %s' % (p.name, a, p.dtype)) + elif p.ctype in type_dict: + t = type_dict[p.ctype] + if get_dtype(t) != get_dtype(a): + raise TypeError( + 'Type is mismatched. %s %s %s %s' % ( + p.name, a, t, p.ctype)) + else: + type_dict[p.ctype] = a + + assert len(in_params) == len(in_args_dtype) + unknown_ctype = [] # TODO(leofang): remove this as it's unused? + for p, a in zip(in_params, in_args_dtype): + if a is None: + if p.dtype is None: + unknown_ctype.append(p.ctype) + else: + if p.dtype is not None: + if numpy.dtype(a) != numpy.dtype(p.dtype): + raise TypeError( + 'Type is mismatched. %s %s %s' % (p.name, a, p.dtype)) + elif p.ctype in type_dict: + t = type_dict[p.ctype] + if numpy.dtype(t) != numpy.dtype(a): + raise TypeError( + 'Type is mismatched. %s %s %s %s' % ( + p.name, a, t, p.ctype)) + else: + type_dict[p.ctype] = a + + in_types = tuple([type_dict[p.ctype] if p.dtype is None else p.dtype + for p in in_params]) + out_types = tuple([type_dict[p.ctype] if p.dtype is None else p.dtype + for p in out_params]) + type_map = _TypeMap(tuple(sorted(type_dict.items()))) + return in_types, out_types, type_map + + +cdef list _broadcast(list args, tuple params, bint use_size, shape_t& shape): + # `shape` is an output argument + cdef Py_ssize_t i + cdef ParameterInfo p + cdef bint any_nonraw_array = False + + # Collect non-raw arrays + value = [] + for i, a in enumerate(args): + p = params[i] + if not p.raw and isinstance(a, _ndarray_base): + # Non-raw array + any_nonraw_array = True + value.append(a) + else: + value.append(None) + + if use_size: + if any_nonraw_array: + raise ValueError('Specified \'size\' can be used only ' + 'if all of the ndarray are \'raw\'.') + else: + if not any_nonraw_array: + raise ValueError('Loop size is undecided.') + + # Perform broadcast. + # Note that arrays in `value` are replaced with broadcasted ones. + internal._broadcast_core(value, shape) + + # Restore raw arrays and scalars from the original list. 
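+ # Added illustration (assuming standard NumPy broadcasting rules): non-raw arrays of shapes (3, 1) and (4,) are broadcast to the common shape (3, 4), which _broadcast_core writes into `shape`; raw arrays and scalars (the None slots restored below) bypass broadcasting entirely.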
+ for i, a in enumerate(value): + if a is None: + value[i] = args[i] + return value + + +cdef _numpy_can_cast = numpy.can_cast + + +cdef list _get_out_args_from_optionals( + subtype, list out_args, tuple out_types, const shape_t& out_shape, casting, + obj +): + cdef _ndarray_base arr + + while len(out_args) < len(out_types): + out_args.append(None) + + for i, a in enumerate(out_args): + if a is None: + out_args[i] = _ndarray_init( + subtype, out_shape, out_types[i], obj) + continue + + if not isinstance(a, _ndarray_base): + raise TypeError( + 'Output arguments type must be cupy.ndarray') + arr = a + if not internal.vector_equal(arr._shape, out_shape): + raise ValueError('Out shape is mismatched') + out_type = get_dtype(out_types[i]) + + _raise_if_invalid_cast(out_type, arr.dtype, casting, "output operand") + return out_args + + +cdef _copy_in_args_if_needed(list in_args, list out_args): + # `in_args` is an input and output argument + cdef _ndarray_base inp, out + for i in range(len(in_args)): + a = in_args[i] + if isinstance(a, _ndarray_base): + inp = a + for out in out_args: + if inp is not out and may_share_bounds(inp, out): + in_args[i] = inp.copy() + break + + +cdef list _get_out_args_with_params( + list out_args, tuple out_types, const shape_t& out_shape, + tuple out_params, bint is_size_specified): + cdef ParameterInfo p + cdef _ndarray_base arr + if not out_args: + for p in out_params: + if p.raw and not is_size_specified: + raise ValueError('Output array size is Undecided') + return [_ndarray_init( + cupy.ndarray, out_shape, t, None) for t in out_types] + + for i, p in enumerate(out_params): + a = out_args[i] + if not isinstance(a, _ndarray_base): + raise TypeError( + 'Output arguments type must be cupy.ndarray') + arr = a + if not p.raw and not internal.vector_equal(arr._shape, out_shape): + raise ValueError('Out shape is mismatched') + return out_args + + +@_util.memoize() +def _get_elementwise_kernel_code( + tuple arginfos, _TypeMap type_map, + tuple params, str operation, str name, + str preamble, str loop_prep='', str after_loop='', tuple options=()): + cdef _ArgInfo arginfo + + op = [] + for p, arginfo in zip(params, arginfos): + if arginfo.is_ndarray() and not p.raw: + if p.is_const: + fmt = 'const {t} &{n} = _raw_{n}[_ind.get()];' + else: + fmt = '{t} &{n} = _raw_{n}[_ind.get()];' + op.append(fmt.format(t=p.ctype, n=p.name)) + op.append(operation) + operation = '\n'.join(op) + return _get_simple_elementwise_kernel_code( + params, arginfos, operation, name, type_map, + preamble, loop_prep, after_loop) + + +@_util.memoize(for_each_device=True) +def _get_elementwise_kernel( + tuple arginfos, _TypeMap type_map, + tuple params, str operation, str name, + str preamble, str loop_prep='', str after_loop='', tuple options=()): + cdef str code = _get_elementwise_kernel_code( + arginfos, type_map, params, operation, name, preamble, loop_prep, + after_loop + ) + return _get_simple_elementwise_kernel_from_code(name, code, options) + + +cdef class ElementwiseKernel: + + """User-defined elementwise kernel. + + This class can be used to define an elementwise kernel with or without + broadcasting. + + The kernel is compiled at an invocation of the + :meth:`~ElementwiseKernel.__call__` method, + which is cached for each device. + The compiled binary is also cached into a file under the + ``$HOME/.cupy/kernel_cache/`` directory with a hashed file name. The cached + binary is reused by other processes. + + Args: + in_params (str): Input argument list. + out_params (str): Output argument list. 
+ operation (str): The body in the loop written in CUDA-C/C++. + name (str): Name of the kernel function. It should be set for + readability of the performance profiling. + reduce_dims (bool): If ``False``, the shapes of array arguments are + kept within the kernel invocation. The shapes are reduced + (i.e., the arrays are reshaped without copy to the minimum + dimension) by default. It may make the kernel fast by reducing the + index calculations. + options (tuple): Compile options passed to NVRTC. For details, see + https://docs.nvidia.com/cuda/nvrtc/index.html#group__options. + preamble (str): Fragment of the CUDA-C/C++ code that is inserted at the + top of the cu file. + no_return (bool): If ``True``, __call__ returns ``None``. + return_tuple (bool): If ``True``, __call__ always returns tuple of + array even if single value is returned. + loop_prep (str): Fragment of the CUDA-C/C++ code that is inserted at + the top of the kernel function definition and above the ``for`` + loop. + after_loop (str): Fragment of the CUDA-C/C++ code that is inserted at + the bottom of the kernel function definition. + + """ + + cdef: + readonly tuple in_params + readonly tuple out_params + readonly Py_ssize_t nin + readonly Py_ssize_t nout + readonly Py_ssize_t nargs + readonly tuple params + readonly object operation + readonly str name + readonly str __name__ + readonly bint reduce_dims + readonly object preamble + readonly bint no_return + readonly bint return_tuple + readonly dict kwargs + readonly dict _params_type_memo + readonly dict _elementwise_kernel_memo + readonly dict _cached_codes + + def __init__(self, in_params, out_params, operation, + name='kernel', reduce_dims=True, preamble='', + no_return=False, return_tuple=False, **kwargs): + if not compiler.is_valid_kernel_name(name): + raise ValueError( + 'Invalid kernel name: "%s"' % name) + + self.in_params = _get_param_info(in_params, True) + self.out_params = _get_param_info(out_params, False) + self.nin = len(self.in_params) + self.nout = len(self.out_params) + self.nargs = self.nin + self.nout + param_rest = _get_param_info('CIndexer _ind', False) + self.params = self.in_params + self.out_params + param_rest + self.operation = operation + self.name = name + self.reduce_dims = reduce_dims + self.preamble = preamble + self.no_return = no_return + self.return_tuple = return_tuple + self.kwargs = kwargs + self._params_type_memo = {} + self._cached_codes = {} + names = [p.name for p in self.in_params + self.out_params] + if 'i' in names: + raise ValueError('Can not use \'i\' as a parameter name') + self._elementwise_kernel_memo = {} + # This is for profiling mechanisms to auto infer a name + self.__name__ = name + + def __call__(self, *args, **kwargs): + """Compiles and invokes the elementwise kernel. + + The compilation runs only if the kernel is not cached. Note that the + kernels with different argument dtypes or dimensions are not + compatible. It means that single ElementwiseKernel object may be + compiled into multiple kernel binaries. + + Args: + args: Arguments of the kernel. + size (int): Range size of the indices. By default, the range size + is automatically determined from the result of broadcasting. + This parameter must be specified if and only if all ndarrays + are `raw` and the range size cannot be determined + automatically. + block_size (int): Number of threads per block. By default, the + value is set to 128. 
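+ + Example (added illustration; a squared-difference kernel): + + >>> squared_diff = cupy.ElementwiseKernel( + ... 'float32 x, float32 y', 'float32 z', + ... 'z = (x - y) * (x - y)', 'squared_diff') + >>> squared_diff(cupy.arange(4, dtype='float32'), 1.0) + array([1., 0., 1., 4.], dtype=float32)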
+ + Returns: + If ``no_return`` is not set, arrays are returned according to the + ``out_params`` argument of the ``__init__`` method. + If ``no_return`` is set, ``None`` is returned. + + """ + cdef function.Function kern + cdef Py_ssize_t size, i + cdef list in_args, out_args + cdef tuple in_types, out_types + cdef shape_t shape + + size = kwargs.pop('size', -1) + stream = kwargs.pop('stream', None) + block_size = kwargs.pop('block_size', 128) + if len(kwargs): + raise TypeError('Wrong arguments %s' % kwargs) + if block_size <= 0: + raise ValueError('block_size must be greater than zero') + n_args = len(args) + if n_args != self.nin and n_args != self.nargs: + raise TypeError( + 'Wrong number of arguments for {!r}. ' + 'It must be either {} or {} (with outputs), ' + 'but given {}.'.format( + self.name, self.nin, self.nargs, n_args)) + for arg in args: + if hasattr(arg, '__cupy_override_elementwise_kernel__'): + return arg.__cupy_override_elementwise_kernel__( + self, *args, **kwargs) + dev_id = device.get_device_id() + arg_list = _preprocess_args(dev_id, args, True) + + out_args = arg_list[self.nin:] + # _broadcast updates shape + in_args = _broadcast( + arg_list, self.params, size != -1, shape)[:self.nin] + + in_ndarray_types = [] + for a in in_args: + if isinstance(a, _ndarray_base): + t = a.dtype.type + elif isinstance(a, texture.TextureObject): + t = 'cudaTextureObject_t' + else: + t = None + in_ndarray_types.append(t) + in_ndarray_types = tuple(in_ndarray_types) + out_ndarray_types = tuple([a.dtype.type for a in out_args]) + + in_types, out_types, type_map = self._decide_params_type( + in_ndarray_types, out_ndarray_types) + + is_size_specified = False + if size != -1: + shape.assign(1, size) + is_size_specified = True + + out_args = _get_out_args_with_params( + out_args, out_types, shape, self.out_params, is_size_specified) + if self.no_return: + ret = None + elif not self.return_tuple and self.nout == 1: + ret = out_args[0] + else: + ret = tuple(out_args) + + if _contains_zero(shape): + return ret + + for i, x in enumerate(in_args): + if type(x) is _scalar.CScalar: + (<_scalar.CScalar>x).apply_dtype(in_types[i]) + + inout_args = in_args + out_args + + if self.reduce_dims: + shape = _reduce_dims(inout_args, self.params, shape) + indexer = _carray._indexer_init(shape) + inout_args.append(indexer) + + arginfos = _get_arginfos(inout_args) + kern = self._get_elementwise_kernel(dev_id, arginfos, type_map) + kern.linear_launch(indexer.size, inout_args, shared_mem=0, + block_max_size=block_size, stream=stream) + return ret + + cpdef tuple _decide_params_type( + self, tuple in_args_dtype, tuple out_args_dtype): + key = (in_args_dtype, out_args_dtype) + ret = self._params_type_memo.get(key, None) + if ret is not None: + return ret + ret = _decide_params_type_core( + self.in_params, self.out_params, in_args_dtype, out_args_dtype) + self._params_type_memo[key] = ret + return ret + + cpdef function.Function _get_elementwise_kernel( + self, int dev_id, tuple arginfos, _TypeMap type_map): + key = ( + dev_id, + arginfos, + type_map) + kern = self._elementwise_kernel_memo.get(key, None) + if kern is not None: + return kern + kern = _get_elementwise_kernel( + arginfos, type_map, self.params, self.operation, + self.name, self.preamble, **self.kwargs) + + # Store the compiled kernel in the cache. + # Potentially overwrite a duplicate cache entry because + # _get_elementwise_kernel() may include IO wait. 
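+ # Added sketch of the memoization pattern in this method (illustrative pseudocode with hypothetical names, not the exact calls): + # kern = memo.get((dev_id, arginfos, type_map)) + # if kern is None: + # kern = nvrtc_compile(...) # hypothetical helper + # memo[(dev_id, arginfos, type_map)] = kern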
+ in_types = [] + for x in arginfos: + if x.type is cupy.ndarray: + in_types.append(cupy.dtype(x.dtype).char) + in_types = tuple(in_types) + if in_types not in self._cached_codes: + code = _get_elementwise_kernel_code( + arginfos, type_map, self.params, self.operation, + self.name, self.preamble, **self.kwargs) + self._cached_codes[in_types] = code + self._elementwise_kernel_memo[key] = kern + return kern + + @property + def cached_codes(self): + """Returns a dict that has input types as keys and codes as values. + + This property method is for debugging purposes. + The return value is not guaranteed to keep backward compatibility. + """ + if len(self._cached_codes) == 0: + warnings.warn( + 'No codes are cached because compilation is deferred until ' + 'the first function call.') + return dict([(k, v) for k, v in self._cached_codes.items()]) + + @property + def cached_code(self): + """Returns `next(iter(self.cached_codes.values()))`. + + This property method is for debugging purposes. + The return value is not guaranteed to keep backward compatibility. + """ + codes = self._cached_codes + if len(codes) > 1: + warnings.warn( + 'The input types of the kernel could not be inferred. ' + 'Please use `.cached_codes` instead.') + return next(iter(codes.values())) + + +cdef str fix_cast_expr(src_type, dst_type, str expr): + src_kind = get_dtype(src_type).kind + dst_kind = get_dtype(dst_type).kind + if src_kind == dst_kind: + return expr + if src_kind == 'b': + # HIP has an issue with bool conversions detailed below + if runtime._is_hip_environment: + return f'_hip_bool_cast({expr})' + else: + return f'({expr}) ? 1 : 0' + if src_kind == 'c': + if dst_kind == 'b': + return f'({expr}) != {_scalar.get_typename(src_type)}()' + else: # dst_kind in 'iuf' (int, uint, float) + return f'({expr}).real()' + return expr + + +cdef function.Function _get_ufunc_kernel( + tuple in_types, tuple out_types, routine, tuple arginfos, + bint has_where, params, + name, preamble, loop_prep): + cdef _ArgInfo arginfo + cdef str str_type, str_var + + offset_where = len(in_types) + offset_out = offset_where + if has_where: + offset_out += 1 + + types = [] + op = [] + if has_where: + arginfo = arginfos[offset_where] + if arginfo.is_ndarray(): + op.append('if(!_raw__where[_ind.get()]) continue;') + else: + op.append('if(!_where) continue;') + for i, x in enumerate(in_types): + str_var = 'in%d' % i + str_type = str_var + '_type' + types.append((str_type, x)) + arginfo = arginfos[i] + if arginfo.is_ndarray(): + op.append('const {} {}({});'.format( + str_type, + str_var, + fix_cast_expr(arginfo.dtype, x, f'_raw_{str_var}[_ind.get()]') + )) + + out_op = [] + for i, x in enumerate(out_types): + str_var = 'out%d' % i + str_type = str_var + '_type' + types.append((str_type, x)) + arginfo = arginfos[i + offset_out] + op.append(f'{str_type} {str_var};') + out_op.append('{} = {};'.format( + f'_raw_{str_var}[_ind.get()]', + fix_cast_expr(x, arginfo.dtype, str_var) + )) + + type_map = _TypeMap(tuple(types)) + + op.append(routine) + op.append(';') + op.extend(out_op) + operation = '\n'.join(op) + # HIP/ROCm 4.3 has an issue with ifs and ternary operators + # + # int bool(int x) { + # if (x != 0) return 1; + # return 0; + # } + # + # bool(5) == 1; //false + # bool(5) == 5; //true + # + # also it simplifies (a ? 
1 : 0) directly to a, and yields + # an incorrect value + if runtime._is_hip_environment: + preamble += """ + __device__ int _hip_bool_cast(long long int x) { + volatile int a = 1; + if (x == 0) a = 0; + return a; + } + """ + return _get_simple_elementwise_kernel( + params, arginfos, operation, name, type_map, preamble, + loop_prep=loop_prep) + + +cdef inline bint _check_should_use_min_scalar(list in_args) except? -1: + cdef int kind, max_array_kind, max_scalar_kind + cdef bint all_scalars + all_scalars = True + max_array_kind = -1 + max_scalar_kind = -1 + for i in in_args: + kind = _get_kind_score(ord(i.dtype.kind)) + if isinstance(i, _ndarray_base): + all_scalars = False + max_array_kind = max(max_array_kind, kind) + else: + max_scalar_kind = max(max_scalar_kind, kind) + return (max_scalar_kind != -1 and + not all_scalars and + max_array_kind >= max_scalar_kind) + + +cdef dict _mst_unsigned_to_signed = { + i: (numpy.iinfo(j).max, (i, j)) + for i, j in [(numpy.dtype(i).type, numpy.dtype(i.lower()).type) + for i in "BHILQ"]} +cdef _numpy_min_scalar_type = numpy.min_scalar_type + +cdef _min_scalar_type(x): + # A non-negative integer may have two locally minimum scalar + # types: signed/unsigned integer. + # Return both for can_cast, while numpy.min_scalar_type only returns + # the unsigned type. + t = _numpy_min_scalar_type(x) + dt = t.type + if t.kind == 'u': + m, dt2 = _mst_unsigned_to_signed[dt] + if x <= m: + return dt2 + return dt + + +cdef class ufunc: + + """Universal function. + + Attributes: + ~ufunc.name (str): The name of the universal function. + ~ufunc.nin (int): Number of input arguments. + ~ufunc.nout (int): Number of output arguments. + ~ufunc.nargs (int): Number of all arguments. + + """ + + cdef: + readonly Py_ssize_t nin + readonly Py_ssize_t nout + readonly Py_ssize_t nargs + readonly object name + readonly _Ops _ops # normal routines + # routines based on explicitly given output dtype + readonly _Ops _out_ops + readonly object _preamble + readonly object _loop_prep + readonly object _default_casting + readonly object _cutensor_op + readonly int _cutensor_alpha + readonly int _cutensor_gamma + readonly str _scatter_op + readonly tuple _params + readonly tuple _params_with_where + readonly dict _routine_cache + readonly dict _kernel_memo + readonly object __doc__ + readonly object __name__ + readonly object __module__ + + def __init__( + self, name, nin, nout, _Ops ops, preamble='', loop_prep='', doc='', + default_casting=None, *, _Ops out_ops=None, cutensor_op=None, + scatter_op=None): + self.name = name + self.__name__ = name + self.nin = nin + self.nout = nout + self.nargs = nin + nout + self._ops = ops + self._out_ops = out_ops + self._preamble = preamble + self._loop_prep = loop_prep + self.__doc__ = doc + if default_casting is None: + self._default_casting = 'same_kind' + else: + self._default_casting = default_casting + if cutensor_op is not None and cuda_cutensor is not None: + self._cutensor_op, self._cutensor_alpha, self._cutensor_gamma = ( + getattr(cuda_cutensor, cutensor_op[0]), + cutensor_op[1], cutensor_op[2]) + self._scatter_op = scatter_op + + _in_params = tuple( + ParameterInfo('T in%d' % i, True) + for i in range(nin)) + _out_params = tuple( + ParameterInfo('T out%d' % i, False) + for i in range(nout)) + _other_params = ( + ParameterInfo('CIndexer _ind', False),) + self._params = _in_params + _out_params + _other_params + self._params_with_where = ( + _in_params + (ParameterInfo('T _where', False),) + + _out_params + _other_params) + self._routine_cache 
= {} + self._kernel_memo = {} + + def __repr__(self): + return "<ufunc '%s'>" % self.name + + @property + def types(self): + """A list of type signatures. + + Each type signature is represented by type character codes of inputs + and outputs separated by '->'. + + """ + types = [] + for op in self._ops.ops: + in_str = ''.join([get_dtype(t).char for t in op.in_types]) + out_str = ''.join([get_dtype(t).char for t in op.out_types]) + types.append('%s->%s' % (in_str, out_str)) + return types + + def __call__(self, *args, **kwargs): + """Applies the universal function to arguments elementwise. + + Args: + args: Input arguments. Each of them can be a :class:`cupy.ndarray` + object or a scalar. The output arguments can be omitted or be + specified by the ``out`` argument. + out (cupy.ndarray): Output array. It outputs to new arrays by + default. + dtype: Data type specifier. + + Returns: + Output array or a tuple of output arrays. + + """ + for arg in args: + if hasattr(arg, '__cupy_override_elementwise_kernel__'): + return arg.__cupy_override_elementwise_kernel__( + self, *args, **kwargs) + + if _fusion_thread_local.is_fusing(): + return _fusion_thread_local.call_ufunc(self, *args, **kwargs) + + cdef function.Function kern + cdef list broad_values + cdef shape_t shape + + out = kwargs.pop('out', None) + where = kwargs.pop('_where', None) + cdef bint has_where = where is not None + dtype = kwargs.pop('dtype', None) + # Note default behavior of casting is 'same_kind' on numpy>=1.10 + casting = kwargs.pop('casting', self._default_casting) + if dtype is not None: + dtype = get_dtype(dtype).type + if kwargs: + raise TypeError('Wrong arguments %s' % kwargs) + + n_args = len(args) + if not (self.nin <= n_args <= self.nargs): + # TODO(kataoka): Fix error message for nout >= 2 (e.g. divmod) + raise TypeError( + 'Wrong number of arguments for {!r}. ' + 'It must be either {} or {} (with outputs), ' + 'but given {}.'.format( + self.name, self.nin, self.nargs, n_args)) + + # parse inputs (positional) and outputs (positional or keyword) + in_args = args[:self.nin] + out_args = args[self.nin:] + if out is not None: + if out_args: + raise ValueError('Cannot specify \'out\' as both ' + 'a positional and keyword argument') + if isinstance(out, tuple): + if len(out) != self.nout: + raise ValueError( + "The 'out' tuple must have exactly one entry per " + "ufunc output") + out_args = out + else: + if 1 != self.nout: + raise ValueError("'out' must be a tuple of arrays") + out_args = out, + + dev_id = device.get_device_id() + in_args = _preprocess_args(dev_id, in_args, False) + out_args = _preprocess_optional_args(dev_id, out_args, False) + given_out_args = [o for o in out_args if o is not None] + + # TODO(kataoka): Typecheck `in_args` w.r.t. `casting` (before + # broadcast). + if has_where: + where_args = _preprocess_args(dev_id, (where,), False) + x = where_args[0] + if isinstance(x, _ndarray_base): + # NumPy seems to use casting='safe' here + if x.dtype != bool: + raise TypeError( + f'Cannot cast array data from {x.dtype!r} to ' + f'{get_dtype(bool)!r} according to the rule \'safe\'') + else: + # NumPy does not seem to raise TypeError. + # CuPy does not have to support `where=object()` etc. and + # `_preprocess_args` rejects it anyway. 
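+ # Added illustration of the `_where` semantics (NumPy-level sketch with hypothetical names): + # mask = cupy.asarray(where, dtype=bool) + # out[mask] = op(in0[mask], in1[mask]) # computed only where mask is True + # out[~mask] keeps whatever the output buffer already contained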
+ where_args[0] = _scalar.CScalar.from_numpy_scalar_with_dtype( + x, numpy.bool_) + else: + where_args = [] + + # _copy_in_args_if_needed updates in_args + _copy_in_args_if_needed(in_args, given_out_args) + _copy_in_args_if_needed(where_args, given_out_args) + broad_values = in_args + where_args + given_out_args + # _broadcast updates shape + internal._broadcast_core(broad_values, shape) + + if (self._cutensor_op is not None + and _accelerator.ACCELERATOR_CUTENSOR in + _accelerator._elementwise_accelerators): + if (self.nin == 2 and self.nout == 1 and + isinstance(in_args[0], _ndarray_base) and + isinstance(in_args[1], _ndarray_base)): + import cupyx.cutensor + ret = cupyx.cutensor._try_elementwise_binary_routine( + in_args[0], in_args[1], dtype, + out_args[0] if len(out_args) == 1 else None, + self._cutensor_op, + self._cutensor_alpha, + self._cutensor_gamma, + ) + if ret is not None: + return ret + + op = self._ops.guess_routine( + self.name, self._routine_cache, in_args, dtype, self._out_ops) + + # Determine a template object from which we initialize the output when + # inputs have subclass instances + def issubclass1(cls, classinfo): + return issubclass(cls, classinfo) and cls is not classinfo + subtype = cupy.ndarray + template = None + for in_arg in in_args: + in_arg_type = type(in_arg) + if issubclass1(in_arg_type, cupy.ndarray): + subtype = in_arg_type + template = in_arg + break + + out_args = _get_out_args_from_optionals( + subtype, out_args, op.out_types, shape, casting, template) + if self.nout == 1: + ret = out_args[0] + else: + ret = tuple(out_args) + + if _contains_zero(shape): + return ret + + inout_args = [] + for i, t in enumerate(op.in_types): + x = broad_values[i] + inout_args.append( + x if isinstance(x, _ndarray_base) else + _scalar.CScalar.from_numpy_scalar_with_dtype(x, t)) + if has_where: + x = broad_values[self.nin] + inout_args.append(x) + inout_args.extend(out_args) + shape = _reduce_dims(inout_args, self._params, shape) + indexer = _carray._indexer_init(shape) + inout_args.append(indexer) + arginfos = _get_arginfos(inout_args) + + kern = self._get_ufunc_kernel(dev_id, op, arginfos, has_where) + + kern.linear_launch(indexer.size, inout_args) + return ret + + cdef str _get_name_with_type(self, tuple arginfos, bint has_where): + cdef str name = self.name + if has_where: + name += '_where' + cdef _ArgInfo arginfo + inout_type_words = [] + for arginfo in arginfos: + dtype = str(numpy.dtype(arginfo.dtype)) + if arginfo.is_ndarray(): + inout_type_words.append(dtype) + elif arginfo.is_scalar(): + inout_type_words.append(dtype.rstrip('0123456789')) + return '{}__{}'.format(name, '_'.join(inout_type_words)) + + cdef function.Function _get_ufunc_kernel( + self, int dev_id, _Op op, tuple arginfos, bint has_where): + cdef function.Function kern + key = (dev_id, op, arginfos, has_where) + kern = self._kernel_memo.get(key, None) + if kern is None: + name = self._get_name_with_type(arginfos, has_where) + params = self._params_with_where if has_where else self._params + kern = _get_ufunc_kernel( + op.in_types, op.out_types, op.routine, arginfos, has_where, + params, name, self._preamble, self._loop_prep) + self._kernel_memo[key] = kern + return kern + + def outer(self, A, B, **kwargs): + """Apply the ufunc operation to all pairs of elements in A and B. + + .. 
seealso:: + :meth:`numpy.ufunc.outer` + + """ + A = core.array(A) + B = core.array(B) + ndim_a = A.ndim + ndim_b = B.ndim + A = A.reshape(A.shape + (1,) * ndim_b) + B = B.reshape((1,) * ndim_a + B.shape) + return self(A, B, **kwargs) + + def at(self, a, indices, b=None): + """Apply in place operation on the operand ``a`` for elements + specified by ``indices``. + + .. seealso:: + :meth:`numpy.ufunc.at` + """ + if self._scatter_op is not None: + a._scatter_op(indices, b, self._scatter_op) + else: + raise NotImplementedError(f'`{self.name}.at` is not supported yet') + + def reduce(self, array, axis=0, dtype=None, out=None, keepdims=False): + """Reduce ``array`` applying ufunc. + + .. seealso:: + :meth:`numpy.ufunc.reduce` + """ + if self.name == 'cupy_add': + return array.sum(axis, dtype, out, keepdims) + if self.name == 'cupy_multiply': + return array.prod(axis, dtype, out, keepdims) + raise NotImplementedError(f'`{self.name}.reduce` is not supported yet') + + def accumulate(self, array, axis=0, dtype=None, out=None): + """Accumulate ``array`` applying ufunc. + + .. seealso:: + :meth:`numpy.ufunc.accumulate` + """ + if self.name == 'cupy_add': + return array.cumsum(axis, dtype, out) + if self.name == 'cupy_multiply': + return array.cumprod(axis, dtype, out) + raise NotImplementedError( + f'`{self.name}.accumulate` is not supported yet') + + def reduceat(self, array, indices, axis=0, dtype=None, out=None): + """Reduce ``array`` applying ufunc with indices. + + .. seealso:: + :meth:`numpy.ufunc.reduceat` + """ + if self.name == 'cupy_add': + return array._add_reduceat(indices, axis, dtype, out) + raise NotImplementedError( + f'`{self.name}.reduceat` is not supported yet') + + +cdef class _Op: + + def __init__( + self, tuple in_types, tuple out_types, object routine, + object error_func): + if error_func is None: + assert routine is not None + else: + assert callable(error_func) + self.in_types = in_types + self.out_types = out_types + self.nin = len(in_types) + self.nout = len(out_types) + self.routine = routine + self.error_func = error_func + + @staticmethod + cdef _Op _from_type_and_routine_or_error_func( + str typ, object routine, object error_func): + # TODO(niboshi): Write type mapping specification. 
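+ # Added note on the signature format (inferred from the parsing below): `typ` is either a full signature such as 'qq->q', giving in_types ('q', 'q') and out_types ('q',) as NumPy dtype characters, or a single type list used for both sides. + # E.g. _Op.from_type_and_routine('qq->q', 'out0 = in0 + in1') yields in_types (int64, int64) and out_types (int64,).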
+ types = typ.split('->') + if len(types) == 1: + in_types = out_types = tuple(types) + else: + in_types, out_types = map(tuple, types) + in_types = tuple([get_dtype(t).type for t in in_types]) + out_types = tuple([get_dtype(t).type for t in out_types]) + return _Op(in_types, out_types, routine, error_func) + + @staticmethod + cdef _Op from_type_and_routine(str typ, routine): + return _Op._from_type_and_routine_or_error_func(typ, routine, None) + + @staticmethod + cdef _Op from_type_and_error_func(str typ, error_func): + return _Op._from_type_and_routine_or_error_func(typ, None, error_func) + + cdef check_valid(self): + if self.error_func is not None: + self.error_func() + + cpdef tuple get_in_dtypes(self): + return tuple([get_dtype(t) for t in self.in_types]) + + cpdef tuple get_out_dtypes(self): + return tuple([get_dtype(t) for t in self.out_types]) + + +cdef class _Ops: + + def __init__(self, tuple ops): + assert len(ops) > 0 + nin = ops[0].nin + nout = ops[0].nout + assert all(op.nin == nin for op in ops) + assert all(op.nout == nout for op in ops) + self.ops = ops + self.nin = nin + self.nout = nout + + @staticmethod + cdef _Ops from_tuples(object ops, routine): + ops_ = [] + for t in ops: + if isinstance(t, tuple): + typ, rt = t + if isinstance(rt, tuple): + rt = tuple([r1 or r2 for r1, r2 in zip(rt, routine)]) + elif not isinstance(rt, str): + assert callable(rt) + ops_.append(_Op.from_type_and_error_func(typ, rt)) + continue + else: + assert isinstance(t, str) + typ, rt = t, routine + ops_.append(_Op.from_type_and_routine(typ, rt)) + return _Ops(tuple(ops_)) + + cpdef _Op guess_routine( + self, str name, dict cache, list in_args, dtype, _Ops out_ops): + cdef _Ops ops_ + if dtype is None: + use_raw_value = _check_should_use_min_scalar(in_args) + if use_raw_value: + in_types = tuple([ + a.dtype.type if isinstance(a, _ndarray_base) + else _min_scalar_type(a) + for a in in_args]) + else: + in_types = tuple([a.dtype.type for a in in_args]) + op = cache.get(in_types, ()) + if op is (): + op = self._guess_routine_from_in_types(in_types) + cache[in_types] = op + else: + op = cache.get(dtype, ()) + if op is (): + ops_ = out_ops or self + op = ops_._guess_routine_from_dtype(dtype) + cache[dtype] = op + + if op is not None: + # raise TypeError if the type combination is disallowed + (<_Op>op).check_valid() + + return op + + if dtype is None: + dtype = tuple([a.dtype.type for a in in_args]) + raise TypeError('Wrong type (%s) of arguments for %s' % + (dtype, name)) + + cpdef _Op _guess_routine_from_in_types( + self, tuple in_types, object can_cast=_numpy_can_cast): + cdef _Op op + cdef tuple op_types + cdef Py_ssize_t n = len(in_types) + cdef Py_ssize_t i + for op in self.ops: + op_types = op.in_types + for i in range(n): + it = in_types[i] + ot = op_types[i] + if isinstance(it, tuple): + if not can_cast(it[0], ot) and not can_cast(it[1], ot): + break + elif not can_cast(it, ot): + break + else: + return op + return None + + cpdef _Op _guess_routine_from_dtype(self, object dtype): + cdef _Op op + cdef tuple op_types + for op in self.ops: + op_types = op.out_types + for t in op_types: + if t != dtype: + break + else: + return op + return None + + +cpdef create_ufunc(name, ops, routine=None, preamble='', doc='', + default_casting=None, loop_prep='', out_ops=None, + cutensor_op=None, scatter_op=None): + ops_ = _Ops.from_tuples(ops, routine) + _out_ops = None if out_ops is None else _Ops.from_tuples(out_ops, routine) + return ufunc( + name, ops_.nin, ops_.nout, ops_, preamble, + loop_prep, doc, 
default_casting=default_casting, out_ops=_out_ops, + cutensor_op=cutensor_op, scatter_op=scatter_op) diff --git a/cupy/_core/_memory_range.pxd b/cupy/_core/_memory_range.pxd new file mode 100644 index 0000000..d359b0e --- /dev/null +++ b/cupy/_core/_memory_range.pxd @@ -0,0 +1,7 @@ +from cupy._core.core cimport _ndarray_base + +from libcpp.pair cimport pair + + +cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array) +cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b) diff --git a/cupy/_core/_memory_range.pyx b/cupy/_core/_memory_range.pyx new file mode 100644 index 0000000..db1a8b1 --- /dev/null +++ b/cupy/_core/_memory_range.pyx @@ -0,0 +1,40 @@ +from cupy._core.core cimport _ndarray_base +from cupy.cuda cimport memory + +from libcpp.pair cimport pair + + +cpdef pair[Py_ssize_t, Py_ssize_t] get_bound(_ndarray_base array): + cdef Py_ssize_t left = array.data.ptr + cdef Py_ssize_t right = left + cdef Py_ssize_t tmp + cdef pair[Py_ssize_t, Py_ssize_t] ret + cdef size_t i + + for i in range(array._shape.size()): + # shape[i] != 0 is assumed + tmp = (array._shape[i] - 1) * array._strides[i] + if tmp > 0: + right += tmp + else: + left += tmp + + ret.first = left + ret.second = right + array.dtype.itemsize + return ret + + +cpdef bint may_share_bounds(_ndarray_base a, _ndarray_base b): + cdef memory.MemoryPointer a_data = a.data + cdef memory.MemoryPointer b_data = b.data + cdef pair[Py_ssize_t, Py_ssize_t] a_range, b_range + + if (a_data.device_id != b_data.device_id + or a_data.mem.ptr != b_data.mem.ptr + or a.size == 0 or b.size == 0): + return False + + a_range = get_bound(a) + b_range = get_bound(b) + + return a_range.first < b_range.second and b_range.first < a_range.second diff --git a/cupy/_core/_optimize_config.pxd b/cupy/_core/_optimize_config.pxd new file mode 100644 index 0000000..070451f --- /dev/null +++ b/cupy/_core/_optimize_config.pxd @@ -0,0 +1,22 @@ +cdef object _thread_local +cdef dict _contexts + + +cdef class _OptimizationConfig: + + cdef readonly object optimize_impl + cdef readonly int max_trials + cdef readonly float timeout + cdef readonly float expected_total_time_per_trial + cdef readonly float max_total_time_per_trial + + +cdef class _OptimizationContext: + + cdef readonly str key + cdef readonly _OptimizationConfig config + cdef readonly dict _params_map + cdef readonly bint _dirty + + +cpdef _OptimizationContext get_current_context() diff --git a/cupy/_core/_optimize_config.pyx b/cupy/_core/_optimize_config.pyx new file mode 100644 index 0000000..4b38b37 --- /dev/null +++ b/cupy/_core/_optimize_config.pyx @@ -0,0 +1,81 @@ +import pickle +import threading + + +cdef _thread_local = threading.local() +cdef _contexts = {} + + +cdef class _OptimizationConfig: + + def __init__( + self, optimize_impl, *, + int max_trials=100, + float timeout=1, + float expected_total_time_per_trial=100 * 1e-6, + float max_total_time_per_trial=0.1): + self.optimize_impl = optimize_impl + self.max_trials = max_trials + self.timeout = timeout + self.expected_total_time_per_trial = expected_total_time_per_trial + self.max_total_time_per_trial = max_total_time_per_trial + + +cdef class _OptimizationContext: + + def __init__(self, str key, _OptimizationConfig config): + self.key = key + self.config = config + self._params_map = {} + self._dirty = False + + def get_params(self, key): + return self._params_map.get(key) + + def set_params(self, key, params): + self._params_map[key] = params + self._dirty = True + + def save(self, filepath): + with open(filepath, mode='wb') as 
f: + pickle.dump((self.key, self._params_map), f) + self._dirty = False + + def load(self, filepath): + with open(filepath, mode='rb') as f: + key, params_map = pickle.load(f) + if key != self.key: + raise ValueError( + 'Optimization key mismatch {} != {}'.format(key, self.key)) + self._params_map = params_map + self._dirty = False + + def _is_dirty(self): + return self._dirty + + +cpdef _OptimizationContext get_current_context(): + try: + return _thread_local.current_context + except AttributeError: + return None + + +def set_current_context(_OptimizationContext context): + _thread_local.current_context = context + + +def get_new_context( + str key, object optimize_impl, dict config_dict): + c = _contexts.get(key) + if c is None: + config = _OptimizationConfig(optimize_impl, **config_dict) + c = _OptimizationContext(key, config) + _contexts[key] = c + return c + + +def _clear_all_contexts_cache(): + global _contexts + assert get_current_context() is None + _contexts = {} diff --git a/cupy/_core/_reduction.pxd b/cupy/_core/_reduction.pxd new file mode 100644 index 0000000..035ca13 --- /dev/null +++ b/cupy/_core/_reduction.pxd @@ -0,0 +1,77 @@ +from cupy._core._carray cimport shape_t +from cupy._core cimport _kernel +from cupy._core.core cimport _ndarray_base +from cupy.cuda cimport function + + +cdef Py_ssize_t _block_size + + +cpdef tuple _get_axis(object axis, Py_ssize_t ndim) + +cpdef shape_t _get_out_shape( + const shape_t& shape, tuple reduce_axis, tuple out_axis, bint keepdims) + + +cdef class _AbstractReductionKernel: + + cdef: + readonly str name + public str identity + readonly tuple in_params + readonly tuple out_params + readonly tuple _params + readonly str __name__ + readonly dict _cached_codes + + cpdef _ndarray_base _call( + self, + list in_args, list out_args, + const shape_t& a_shape, axis, dtype, + bint keepdims, bint reduce_dims, int device_id, + stream, bint try_use_cub=*, bint sort_reduce_axis=*) + + cdef void _launch( + self, out_block_num, block_size, block_stride, + in_args, out_args, in_shape, out_shape, types, + map_expr, reduce_expr, post_map_expr, reduce_type, + stream, params) + + cdef tuple _get_expressions_and_types( + self, list in_args, list out_args, dtype) + + cdef list _get_out_args( + self, list out_args, tuple out_types, const shape_t& out_shape) + + cdef function.Function _get_function( + self, + tuple params, tuple arginfos, _kernel._TypeMap types, + str map_expr, str reduce_expr, str post_map_expr, str reduce_type, + Py_ssize_t block_size) + + +cdef class ReductionKernel(_AbstractReductionKernel): + + cdef: + readonly int nin + readonly int nout + readonly int nargs + readonly tuple params + readonly str reduce_expr + readonly str map_expr + readonly str post_map_expr + readonly object options + readonly bint reduce_dims + readonly object reduce_type + readonly str preamble + + +cdef shape_t _set_permuted_args( + list args, tuple axis_permutes, const shape_t& shape, tuple params) + +cdef tuple _get_shape_and_strides(list in_args, list out_args) + +cdef _optimizer_copy_arg(a) + +cpdef create_reduction_func( + name, ops, routine=*, identity=*, preamble=*, sort_reduce_axis=*) diff --git a/cupy/_core/_reduction.pyx b/cupy/_core/_reduction.pyx new file mode 100644 index 0000000..0046ce8 --- /dev/null +++ b/cupy/_core/_reduction.pyx @@ -0,0 +1,906 @@ +from cpython cimport sequence + +from cupy._core cimport _carray +from cupy._core cimport _accelerator +from cupy._core._carray cimport shape_t +from cupy._core cimport _cub_reduction +from cupy._core._dtype 
cimport get_dtype +from cupy._core cimport _kernel +from cupy._core._kernel cimport _broadcast +from cupy._core._kernel cimport _check_peer_access +from cupy._core._kernel cimport _get_arginfos +from cupy._core._kernel cimport _get_out_args_from_optionals +from cupy._core._kernel cimport _get_out_args_with_params +from cupy._core._kernel cimport _preprocess_args +from cupy._core._kernel cimport _reduce_dims +from cupy._core._kernel cimport ParameterInfo, _ArgInfo +from cupy._core cimport _optimize_config +from cupy._core cimport _routines_manipulation as _manipulation +from cupy._core cimport _scalar +from cupy._core._scalar import get_typename as _get_typename +from cupy._core.core cimport _convert_object_with_cuda_array_interface +from cupy._core.core cimport _create_ndarray_from_shape_strides +from cupy._core.core cimport compile_with_cache +from cupy._core.core cimport _ndarray_base +from cupy._core cimport internal +from cupy.cuda cimport device +from cupy.cuda cimport function +from cupy_backends.cuda.api cimport runtime + +import math +import string +import warnings + +import numpy + +import cupy +from cupy._core._kernel import _get_param_info +from cupy._core._kernel import _decide_params_type +from cupy._core._ufuncs import elementwise_copy +from cupy.cuda import compiler +from cupy import _util + + +cpdef str _create_reduction_function_code( + name, block_size, reduce_type, params, arginfos, identity, + pre_map_expr, reduce_expr, post_map_expr, + _kernel._TypeMap type_map, input_expr, output_expr, preamble, options): + # An (incomplete) list of internal variables: + # _J : the index of an element in the array + # _block_size : the number of threads in a block; should be a power of 2 + # _block_stride : the number of elements being processed by a block; should + # be a power of 2 and <= _block_size + + module_code = string.Template(''' +${type_preamble} +${preamble} +#define REDUCE(a, b) (${reduce_expr}) +#define POST_MAP(a) (${post_map_expr}) +#define _REDUCE(_offset) if (_tid < _offset) { \ + _type_reduce _a = _sdata[_tid], _b = _sdata[(_tid + _offset)]; \ + _sdata[_tid] = REDUCE(_a, _b); \ +} + +typedef ${reduce_type} _type_reduce; +extern "C" __global__ void ${name}(${params}) { + __shared__ char _sdata_raw[${block_size} * sizeof(_type_reduce)]; + _type_reduce *_sdata = reinterpret_cast<_type_reduce*>(_sdata_raw); + unsigned int _tid = threadIdx.x; + + int _J_offset = _tid >> __popc(_block_stride - 1); // _tid / _block_stride + ptrdiff_t _j_offset = (ptrdiff_t)_J_offset * _out_ind.size(); + int _J_stride = ${block_size} >> __popc(_block_stride - 1); + ptrdiff_t _j_stride = (ptrdiff_t)_J_stride * _out_ind.size(); + + for (ptrdiff_t _i_base = (ptrdiff_t)blockIdx.x * _block_stride; + _i_base < _out_ind.size(); + _i_base += (ptrdiff_t)gridDim.x * _block_stride) { + _type_reduce _s = _type_reduce(${identity}); + ptrdiff_t _i = + _i_base + (_tid & (_block_stride - 1)); // _tid % _block_stride + int _J = _J_offset; + for (ptrdiff_t _j = _i + _j_offset; _j < _in_ind.size(); + _j += _j_stride, _J += _J_stride) { + _in_ind.set(_j); + ${input_expr} + _type_reduce _a = static_cast<_type_reduce>(${pre_map_expr}); + _s = REDUCE(_s, _a); + } + _sdata[_tid] = _s; + __syncthreads(); + for (unsigned int _block = ${block_size} / 2; + _block >= _block_stride; _block >>= 1) { + if (_tid < _block) { + _REDUCE(_block); + } + __syncthreads(); + } + if (_tid < _block_stride) { + _s = _sdata[_tid]; + } + if (_tid < _block_stride && _i < _out_ind.size()) { + _out_ind.set(static_cast<ptrdiff_t>(_i)); + ${output_expr} + 
POST_MAP(_s); + } + } +}''').substitute( + name=name, + block_size=block_size, + reduce_type=reduce_type, + params=_kernel._get_kernel_params(params, arginfos), + identity=identity, + reduce_expr=reduce_expr, + pre_map_expr=pre_map_expr, + post_map_expr=post_map_expr, + type_preamble=type_map.get_typedef_code(), + input_expr=input_expr, + output_expr=output_expr, + preamble=preamble) + return module_code + + +cpdef function.Function _create_reduction_function_from_code( + name, code, options): + module = compile_with_cache(code, options) + return module.get_function(name) + + +cpdef function.Function _create_reduction_function( + name, block_size, reduce_type, params, arginfos, identity, + pre_map_expr, reduce_expr, post_map_expr, + _kernel._TypeMap type_map, input_expr, output_expr, preamble, options): + code = _create_reduction_function_code( + name, block_size, reduce_type, params, arginfos, identity, + pre_map_expr, reduce_expr, post_map_expr, type_map, input_expr, + output_expr, preamble, options + ) + return _create_reduction_function_from_code(name, code, options) + + +cpdef tuple _get_axis(object axis, Py_ssize_t ndim): + cdef Py_ssize_t dim + if axis is None: + return (tuple(range(ndim)), ()) + elif sequence.PySequence_Check(axis): + axis = tuple(axis) + else: + axis = axis, + + reduce_axis = tuple(sorted( + [internal._normalize_axis_index(dim, ndim) for dim in axis])) + out_axis = tuple([dim for dim in range(ndim) if dim not in reduce_axis]) + if len(reduce_axis) + len(out_axis) != ndim: + raise ValueError("duplicate value in 'axis'") + return reduce_axis, out_axis + + +cpdef shape_t _get_out_shape( + const shape_t& shape, tuple reduce_axis, tuple out_axis, + bint keepdims): + cdef shape_t out_shape + if keepdims: + out_shape = shape + for i in reduce_axis: + out_shape[i] = 1 + else: + out_shape.reserve(len(out_axis)) + for i in out_axis: + out_shape.push_back(shape[i]) + return out_shape + + +cdef shape_t _set_permuted_args( + list args, tuple axis_permutes, const shape_t& shape, tuple params): + # This function updates `args` + cdef ParameterInfo p + cdef Py_ssize_t i, s + cdef bint need_permutation = False + cdef shape_t out_shape + for i, s in enumerate(axis_permutes): + if i != s: + need_permutation = True + break + if need_permutation: + for p in params: + if p.raw: + raise NotImplementedError('Illegal conditions') + for i, a in enumerate(args): + if isinstance(a, _ndarray_base): + args[i] = _manipulation._transpose(a, axis_permutes) + out_shape.reserve(len(axis_permutes)) + for i in axis_permutes: + out_shape.push_back(shape[i]) + return out_shape + else: + return shape + + +cdef Py_ssize_t _get_contiguous_size( + list args, tuple params, list out_shape, Py_ssize_t ndim) except -1: + ''' + get contiguous size in the *output* axis (not *reduce* axis!) 
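+ For example (added illustration): a C-contiguous float32 operand whose trailing output dimensions are (4, 8) passes the stride check below twice and accumulates 8 * 4 = 32, so 32 is returned.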
+ ''' + cdef int i, j + cdef ParameterInfo p + cdef Py_ssize_t contiguous_size, tmp_contiguous_size, itemsize + out_ndim = len(out_shape) + contiguous_size = 1 + for i, a in enumerate(args): + if not isinstance(a, _ndarray_base): + continue + p = params[i] + if p.raw: + continue + tmp_contiguous_size = 1 + itemsize = a.dtype.itemsize + for j in range(out_ndim): + if a._strides[ndim-j-1] != tmp_contiguous_size * itemsize: + break + tmp_contiguous_size *= out_shape[out_ndim-j-1] + contiguous_size = max(contiguous_size, tmp_contiguous_size) + return contiguous_size + + +cdef Py_ssize_t _default_block_size = ( + 256 if runtime._is_hip_environment else 512) +cdef Py_ssize_t _min_block_size_log = 5 +cdef Py_ssize_t _max_block_size_log = ( + 8 if runtime._is_hip_environment else 9) + + +cpdef (Py_ssize_t, Py_ssize_t, Py_ssize_t) _get_block_specs( # NOQA + Py_ssize_t in_size, Py_ssize_t out_size, + Py_ssize_t contiguous_size, + Py_ssize_t block_size) except*: + cdef Py_ssize_t reduce_block_size, block_stride, out_block_num + if block_size == -1: + block_size = _default_block_size + + reduce_block_size = max(1, in_size // out_size) + contiguous_size = min(contiguous_size, 32) + block_stride = max(contiguous_size, block_size // reduce_block_size) + block_stride = internal.clp2(block_stride // 2 + 1) # floor + out_block_num = (out_size + block_stride - 1) // block_stride + + return block_size, block_stride, out_block_num + + +cdef tuple _sort_axis(tuple axis, tuple strides): + # Sorts axis in the decreasing order of absolute values of strides. + return tuple(sorted(axis, key=lambda i: -abs(strides[i]))) + + +cdef tuple _get_shape_and_strides(list in_args, list out_args): + cdef list shape_and_strides = [] + for x in in_args + out_args: + if isinstance(x, _ndarray_base): + shape_and_strides.append(x.shape) + shape_and_strides.append(x.strides) + else: + shape_and_strides.append(None) + shape_and_strides.append(None) + return tuple(shape_and_strides) + + +cdef _optimizer_copy_arg(a): + if isinstance(a, _ndarray_base): + x = _create_ndarray_from_shape_strides( + cupy.ndarray, a._shape, a._strides, a.dtype, None) + assert a.data.device_id == x.data.device_id + elementwise_copy(a, x) + return x + return a + + +cdef class _AbstractReductionKernel: + + def __init__( + self, str name, str identity, str in_params, str out_params): + assert name is not None + assert identity is not None + assert in_params is not None + assert out_params is not None + + in_params_ = _get_param_info(in_params, True) + out_params_ = _get_param_info(out_params, False) + params = ( + in_params_ + + out_params_ + + _get_param_info('CIndexer _in_ind, CIndexer _out_ind', False) + + _get_param_info('int32 _block_stride', True)) + + self.name = name + self.identity = identity + self.in_params = in_params_ + self.out_params = out_params_ + self._params = params + # This is for profiling mechanisms to auto infer a name + self.__name__ = name + self._cached_codes = {} + + cpdef _ndarray_base _call( + self, + list in_args, list out_args, + const shape_t& a_shape, axis, dtype, + bint keepdims, bint reduce_dims, int device_id, + stream, bint try_use_cub=False, bint sort_reduce_axis=True): + cdef tuple reduce_axis, out_axis, axis_permutes + cdef tuple params, opt_params + cdef tuple shape_and_strides + cdef Py_ssize_t contiguous_size = -1 + cdef Py_ssize_t block_size, block_stride, out_block_num = 0 + cdef shape_t in_shape, out_shape + cdef _ndarray_base ret + cdef bint cub_success + + if dtype is not None: + dtype = get_dtype(dtype).type + + 
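+ # Added commentary: `_get_expressions_and_types` resolves the dtype-specific routine for this call; for a float32 sum it would yield roughly map_expr 'in0', reduce_expr 'a + b', post_map_expr 'out0 = a' and reduce_type 'float' (illustrative strings; the exact ones live in the routine tables).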
( + map_expr, reduce_expr, post_map_expr, + in_types, out_types, reduce_type, + type_map, + ) = self._get_expressions_and_types(in_args, out_args, dtype) + + reduce_axis, out_axis = _get_axis(axis, a_shape.size()) + + # When there is only one input array, sort the axes in such a way that + # contiguous (C or F) axes can be squashed in _reduce_dims() later. + # TODO(niboshi): Support (out_axis) > 1 + if (len(in_args) == 1 + and len(out_axis) <= 1 + and not in_args[0]._c_contiguous): + strides = in_args[0].strides + if sort_reduce_axis: + reduce_axis = _sort_axis(reduce_axis, strides) + out_axis = _sort_axis(out_axis, strides) + + out_shape = _get_out_shape(a_shape, reduce_axis, out_axis, keepdims) + out_args = self._get_out_args(out_args, out_types, out_shape) + ret = out_args[0] + if ret.size == 0: + return ret + + if self.identity == '' and internal.is_in(a_shape, 0): + raise ValueError(('zero-size array to reduction operation' + ' %s which has no identity') % self.name) + + in_args = [x if isinstance(x, _ndarray_base) else + _scalar.CScalar.from_numpy_scalar_with_dtype(x, t) + for x, t in zip(in_args, in_types)] + + optimize_context = _optimize_config.get_current_context() + key = () + if optimize_context is not None: + # Calculate a key unique to the reduction setting. + shape_and_strides = _get_shape_and_strides(in_args, out_args) + key = (self.name, shape_and_strides, + in_types, out_types, reduce_type, device_id) + + # Try to use CUB + for accelerator in _accelerator._reduction_accelerators: + if try_use_cub and accelerator == _accelerator.ACCELERATOR_CUB: + cub_success = _cub_reduction._try_to_call_cub_reduction( + self, in_args, out_args, a_shape, stream, optimize_context, + key, map_expr, reduce_expr, post_map_expr, reduce_type, + type_map, reduce_axis, out_axis, out_shape, ret) + if cub_success: + return ret + + axis_permutes = reduce_axis + out_axis + in_shape = _set_permuted_args( + in_args, axis_permutes, a_shape, self.in_params) + + if reduce_dims: + in_shape = _reduce_dims(in_args, self.in_params, in_shape) + out_shape = _reduce_dims(out_args, self.out_params, out_shape) + + params = self._params + + # Calculate the reduction block dimensions. 
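+ # Added worked example (illustrative numbers): with in_size=4096, out_size=16, contiguous_size=1 and the CUDA default block_size=512, _get_block_specs computes reduce_block_size = 4096 // 16 = 256, block_stride = clp2(max(1, 512 // 256) // 2 + 1) = 2 and out_block_num = (16 + 1) // 2 = 8, so each of the 8 blocks folds 256 inputs into each of its 2 outputs.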
+ if optimize_context is None: + # Calculate manually + contiguous_size = _get_contiguous_size( + in_args, self.in_params, out_shape, in_shape.size()) + block_size, block_stride, out_block_num = _get_block_specs( + internal.prod(in_shape), + internal.prod(out_shape), + contiguous_size, -1) + else: + # Optimize dynamically + key = ('simple_reduction',) + key + opt_params = optimize_context.get_params(key) + if opt_params is None: + opt_params = self._get_optimized_params( + optimize_context.config, in_args, out_args, + in_shape, out_shape, type_map, map_expr, reduce_expr, + post_map_expr, reduce_type, stream) + optimize_context.set_params(key, opt_params) + block_size, block_stride, out_block_num = opt_params + + # Launch the kernel + self._launch( + out_block_num, + block_size, + block_stride, + in_args, out_args, + in_shape, out_shape, + type_map, + map_expr, reduce_expr, post_map_expr, reduce_type, + stream, params) + + return ret + + def _get_optimized_params( + self, optimize_config, in_args, out_args, in_shape, out_shape, + type_map, map_expr, reduce_expr, post_map_expr, reduce_type, + stream): + out_size = internal.prod(out_shape) + in_args = [_optimizer_copy_arg(a) for a in in_args] + out_args = [_optimizer_copy_arg(a) for a in out_args] + + contiguous_size = _get_contiguous_size( + in_args, self.in_params, out_shape, len(in_shape)) + block_size, block_stride, default_out_block_num = _get_block_specs( + internal.prod(in_shape), + internal.prod(out_shape), + contiguous_size, -1) + default_block_size_log = math.floor(math.log2(block_size)) + default_block_stride_log = math.floor(math.log2(block_stride)) + + def target_func(block_size, block_stride, out_block_num): + self._launch( + out_block_num, block_size, block_stride, in_args, out_args, + in_shape, out_shape, type_map, map_expr, reduce_expr, + post_map_expr, reduce_type, stream, self._params) + + def suggest_func(trial): + block_size_log = trial.suggest_int( + 'block_size_log', _min_block_size_log, _max_block_size_log) + block_size = 2 ** block_size_log + block_stride_log = trial.suggest_int( + 'block_stride_log', 0, block_size_log) + block_stride = 2 ** block_stride_log + max_out_block_num = (out_size + block_stride - 1) // block_stride + out_block_num = trial.suggest_int( + 'out_block_num', 1, max_out_block_num) + + trial.set_user_attr('block_size', block_size) + trial.set_user_attr('block_stride', block_stride) + return block_size, block_stride, out_block_num + + optimize_impl = optimize_config.optimize_impl + best = optimize_impl( + optimize_config, target_func, suggest_func, + default_best={ + 'block_size_log': default_block_size_log, + 'block_stride_log': default_block_stride_log, + 'out_block_num': default_out_block_num, + } + ) + return ( + best.user_attrs['block_size'], + best.user_attrs['block_stride'], + best.params['out_block_num']) + + cdef inline void _launch( + self, out_block_num, block_size, block_stride, + in_args, out_args, in_shape, out_shape, type_map, + map_expr, reduce_expr, post_map_expr, reduce_type, + stream, params): + cdef function.Function func + + inout_args = ( + in_args + + out_args + + [ + _carray._indexer_init(in_shape), + _carray._indexer_init(out_shape), + # block_stride is passed as the last argument. 
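+ # (Added note: the argument order mirrors self._params: inputs, outputs, the _in_ind and _out_ind CIndexers, then the int32 _block_stride.)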
+ _scalar.CScalar.from_int32(block_stride), + ]) + + # Retrieve the kernel function + func = self._get_function( + params, + _get_arginfos(inout_args), + type_map, + map_expr, reduce_expr, post_map_expr, reduce_type, + block_size) + + # Launch the kernel + func.linear_launch( + out_block_num * block_size, inout_args, 0, block_size, stream) + + cdef tuple _get_expressions_and_types( + self, list in_args, list out_args, dtype): + raise NotImplementedError() + + cdef list _get_out_args( + self, list out_args, tuple out_types, const shape_t& out_shape): + raise NotImplementedError() + + cdef function.Function _get_function( + self, + tuple params, tuple arginfos, _kernel._TypeMap type_map, + str map_expr, str reduce_expr, str post_map_expr, str reduce_type, + Py_ssize_t block_size): + raise NotImplementedError() + + @property + def cached_codes(self): + """Returns a dict that has input types as keys and generated codes as values. + + This property method is for debugging purposes. + The return value is not guaranteed to keep backward compatibility. + """ + if len(self._cached_codes) == 0: + warnings.warn( + 'No codes are cached because compilation is deferred until ' + 'the first function call or is handled by CUB.') + return dict([(k, v) for k, v in self._cached_codes.items()]) + + @property + def cached_code(self): + """Returns `next(iter(self.cached_codes.values()))`. + + This property method is for debugging purposes. + The return value is not guaranteed to keep backward compatibility. + """ + codes = self._cached_codes + if len(codes) > 1: + warnings.warn( + 'The code to return could not be determined because ' + 'multiple codes are cached. ' + 'Please use `.cached_codes` instead.') + return next(iter(codes.values())) + + +# ----------------------------------------------------------------------------- +# create_reduction_func +# ----------------------------------------------------------------------------- + +cpdef _SimpleReductionKernel create_reduction_func( + name, ops, routine=None, identity=None, preamble='', + sort_reduce_axis=True): + ops = _kernel._Ops.from_tuples(ops, routine) + return _SimpleReductionKernel( + name, ops, identity, preamble, sort_reduce_axis) + + +cdef class _SimpleReductionKernel(_AbstractReductionKernel): + + cdef: + readonly _kernel._Ops _ops + readonly str preamble + readonly int nin + readonly int nout + readonly str _input_expr + readonly str _output_expr + readonly dict _routine_cache + readonly bint _sort_reduce_axis + + def __init__( + self, name, _kernel._Ops ops, identity, preamble, + sort_reduce_axis=True): + super().__init__( + name, + '' if identity is None else str(identity), + 'T in0', + 'T out0', + ) + self._ops = ops + self.preamble = preamble + self.nin = 1 + self.nout = 1 + self._input_expr = 'const type_in0_raw in0 = _raw_in0[_in_ind.get()];' + self._output_expr = 'type_out0_raw &out0 = _raw_out0[_out_ind.get()];' + self._routine_cache = {} + self._sort_reduce_axis = sort_reduce_axis + + def __call__(self, object a, axis=None, dtype=None, _ndarray_base out=None, + bint keepdims=False): + + cdef _ndarray_base arr + + if isinstance(a, _ndarray_base): + arr = a + elif hasattr(a, '__cuda_array_interface__'): + arr = _convert_object_with_cuda_array_interface(a) + elif hasattr(a, '__cupy_get_ndarray__'): + arr = a.__cupy_get_ndarray__() + else: + raise TypeError( + 'Argument \'a\' has incorrect type (expected %s, got %s)' % + (cupy.ndarray, type(a))) + in_args = [arr] + + dev_id = device.get_device_id() + _check_peer_access(arr, dev_id) + + if out is None: + out_args = [] + else: +
_check_peer_access(out, dev_id) + out_args = [out] + + reduce_dims = True + return self._call( + in_args, out_args, + arr._shape, axis, dtype, keepdims, reduce_dims, dev_id, + None, True, self._sort_reduce_axis) + + cdef tuple _get_expressions_and_types( + self, list in_args, list out_args, dtype): + cdef _kernel._Op op + + op = self._ops.guess_routine( + self.name, self._routine_cache, in_args, dtype, self._ops) + map_expr, reduce_expr, post_map_expr, reduce_type = op.routine + + if reduce_type is None: + reduce_type = _get_typename(op.out_types[0]) + + if out_args: + out_type = out_args[0].dtype.type + else: + out_type = op.out_types[0] + + # We guessed a routine that requires a C2R casting for the input + if (in_args[0].dtype.kind == 'c' + and numpy.dtype(op.in_types[0]).kind == 'f'): + warnings.warn( + 'Casting complex values to real discards the imaginary part', + numpy.ComplexWarning) + in_args[0] = in_args[0].real + + type_map = _kernel._TypeMap(( + ('type_in0_raw', in_args[0].dtype.type), + ('type_out0_raw', out_type), + )) + + return ( + map_expr, reduce_expr, post_map_expr, + op.in_types, op.out_types, reduce_type, + type_map) + + cdef list _get_out_args( + self, list out_args, tuple out_types, const shape_t& out_shape): + return _get_out_args_from_optionals( + cupy.ndarray, out_args, out_types, out_shape, 'unsafe', None) + + cdef function.Function _get_function( + self, + tuple params, tuple arginfos, _kernel._TypeMap type_map, + str map_expr, str reduce_expr, str post_map_expr, str reduce_type, + Py_ssize_t block_size): + + in_types = [] + for x in arginfos: + if x.type is cupy.ndarray: + in_types.append(cupy.dtype(x.dtype).char) + in_types = tuple(in_types) + if in_types not in self._cached_codes: + code = _SimpleReductionKernel_get_cached_function_code( + map_expr, reduce_expr, post_map_expr, reduce_type, + params, arginfos, type_map, + self.name, block_size, self.identity, + self._input_expr, self._output_expr, self.preamble, ()) + self._cached_codes[in_types] = code + + return _SimpleReductionKernel_get_cached_function( + map_expr, reduce_expr, post_map_expr, reduce_type, + params, arginfos, type_map, + self.name, block_size, self.identity, + self._input_expr, self._output_expr, self.preamble, ()) + + +@_util.memoize() +def _SimpleReductionKernel_get_cached_function_code( + map_expr, reduce_expr, post_map_expr, reduce_type, + params, arginfos, _kernel._TypeMap type_map, + name, block_size, identity, input_expr, output_expr, preamble, + options): + return _create_reduction_function_code( + name, block_size, reduce_type, params, arginfos, identity, + map_expr, reduce_expr, post_map_expr, + type_map, input_expr, output_expr, preamble, options) + + +@_util.memoize(for_each_device=True) +def _SimpleReductionKernel_get_cached_function( + map_expr, reduce_expr, post_map_expr, reduce_type, + params, arginfos, _kernel._TypeMap type_map, + name, block_size, identity, input_expr, output_expr, preamble, + options): + return _create_reduction_function( + name, block_size, reduce_type, params, arginfos, identity, + map_expr, reduce_expr, post_map_expr, + type_map, input_expr, output_expr, preamble, options) + + +# ----------------------------------------------------------------------------- +# ReductionKernel +# ----------------------------------------------------------------------------- + + +cdef class ReductionKernel(_AbstractReductionKernel): + + """User-defined reduction kernel. + + This class can be used to define a reduction kernel with or without + broadcasting. 
+ + The kernel is compiled at an invocation of the + :meth:`~ReductionKernel.__call__` method, and the compiled kernel is + cached for each device. The compiled binary is also cached into a file + under the ``$HOME/.cupy/kernel_cache/`` directory with a hashed file + name. The cached binary is reused by other processes. + + Args: + in_params (str): Input argument list. + out_params (str): Output argument list. + map_expr (str): Mapping expression for input values. + reduce_expr (str): Reduction expression. + post_map_expr (str): Mapping expression for reduced values. + identity (str): Identity value for starting the reduction. + name (str): Name of the kernel function. It should be set to keep + the output of performance profiling readable. + reduce_type (str): Type of values to be used for reduction. This type + is used to store the special variable ``a``. + reduce_dims (bool): If ``True``, input arrays are reshaped without copy + to smaller dimensions for efficiency. + preamble (str): Fragment of the CUDA-C/C++ code that is inserted at the + top of the cu file. + options (tuple of str): Additional compilation options. + + """ + + def __init__(self, str in_params, str out_params, + map_expr, reduce_expr, post_map_expr, + identity, name='reduce_kernel', reduce_type=None, + reduce_dims=True, preamble='', options=()): + if not compiler.is_valid_kernel_name(name): + raise ValueError( + 'Invalid kernel name: "%s"' % name) + + super().__init__( + name, + '' if identity is None else str(identity), + in_params, + out_params, + ) + self.nin = len(self.in_params) + self.nout = len(self.out_params) + self.nargs = self.nin + self.nout + self.reduce_expr = reduce_expr + self.map_expr = map_expr + self.post_map_expr = post_map_expr + self.options = options + self.reduce_dims = reduce_dims + if reduce_type is None: + self.reduce_type = self.out_params[0].ctype + else: + self.reduce_type = reduce_type + self.preamble = preamble + + def __call__(self, *args, **kwargs): + """Compiles and invokes the reduction kernel. + + The compilation runs only if the kernel is not cached. Note that + kernels with different argument dtypes, ndims, or axis are not + compatible. This means that a single ReductionKernel object may be + compiled into multiple kernel binaries. + + Args: + args: Arguments of the kernel. + out (cupy.ndarray): The output array. This can only be specified if + ``args`` does not contain the output array. + axis (int or tuple of ints): Axis or axes along which the + reduction is performed. + keepdims (bool): If ``True``, the specified axes are retained as + axes of length one. + stream (cupy.cuda.Stream, optional): The CUDA stream to launch the + kernel on. If not given, the current stream will be used. + + Returns: + Arrays are returned according to the ``out_params`` argument of the + ``__init__`` method.
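+
+        Example:
+            A minimal sketch computing the L2 norm along the last axis
+            (the names ``l2norm`` and ``x`` below are arbitrary):
+
+            >>> l2norm = cupy.ReductionKernel(
+            ...     'T x',          # input params
+            ...     'T y',          # output params
+            ...     'x * x',        # map
+            ...     'a + b',        # reduce
+            ...     'y = sqrt(a)',  # post-reduction map
+            ...     '0',            # identity value
+            ...     'l2norm')       # kernel name
+            >>> x = cupy.arange(10, dtype=cupy.float32).reshape(2, 5)
+            >>> l2norm(x, axis=1)
+            array([ 5.477226 , 15.9687195], dtype=float32)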
+ + """ + cdef shape_t broad_shape + + out = kwargs.pop('out', None) + axis = kwargs.pop('axis', None) + keepdims = kwargs.pop('keepdims', False) + stream = kwargs.pop('stream', None) + if kwargs: + raise TypeError('Wrong arguments %s' % kwargs) + + n_args = len(args) + if n_args != self.nin and n_args != self.nargs: + raise TypeError('Wrong number of arguments for %s' % self.name) + + out_args = list(args[self.nin:]) + if out is not None: + if self.nout != 1: + raise NotImplementedError('') + if len(out_args) != 0: + raise ValueError("cannot specify 'out' as both " + "a positional and keyword argument") + out_args = [out] + + dev_id = device.get_device_id() + in_args = _preprocess_args(dev_id, args[:self.nin], False) + out_args = _preprocess_args(dev_id, out_args, False) + in_args = _broadcast(in_args, self.in_params, False, broad_shape) + + return self._call( + in_args, out_args, + broad_shape, axis, None, + keepdims, self.reduce_dims, dev_id, stream, True, True) + + cdef tuple _get_expressions_and_types( + self, list in_args, list out_args, dtype): + + in_ndarray_types = tuple( + [a.dtype.type if isinstance(a, _ndarray_base) else None + for a in in_args]) + out_ndarray_types = tuple( + [a.dtype.type if isinstance(a, _ndarray_base) else None + for a in out_args]) + in_types, out_types, type_map = _decide_params_type( + self.in_params, self.out_params, + in_ndarray_types, out_ndarray_types) + return ( + self.map_expr, self.reduce_expr, self.post_map_expr, + in_types, out_types, self.reduce_type, + type_map) + + cdef list _get_out_args( + self, list out_args, tuple out_types, const shape_t& out_shape): + return _get_out_args_with_params( + out_args, out_types, out_shape, self.out_params, False) + + cdef function.Function _get_function( + self, + tuple params, tuple arginfos, _kernel._TypeMap type_map, + str map_expr, str reduce_expr, str post_map_expr, str reduce_type, + Py_ssize_t block_size): + + in_types = [] + for x in arginfos: + if x.type is cupy.ndarray: + in_types.append(cupy.dtype(x.dtype).char) + in_types = tuple(in_types) + if in_types not in self._cached_codes: + code =_ReductionKernel_get_cached_function_code( + self.nin, self.nout, params, arginfos, type_map, + self.name, block_size, reduce_type, self.identity, + map_expr, reduce_expr, post_map_expr, + self.preamble, self.options) + self._cached_codes[in_types] = code + return _ReductionKernel_get_cached_function( + self.nin, self.nout, params, arginfos, type_map, + self.name, block_size, reduce_type, self.identity, + map_expr, reduce_expr, post_map_expr, + self.preamble, self.options) + + +@_util.memoize() +def _ReductionKernel_get_cached_function_code( + nin, nout, params, arginfos, _kernel._TypeMap type_map, + name, block_size, reduce_type, identity, map_expr, reduce_expr, + post_map_expr, preamble, options): + cdef ParameterInfo p + cdef _ArgInfo arginfo + in_arrays = [ + p for p, arginfo in zip(params[:nin], arginfos[:nin]) + if not p.raw and arginfo.is_ndarray()] + out_arrays = [ + p for p, arginfo in zip(params[nin:nin+nout], arginfos[nin:nin+nout]) + if not p.raw and arginfo.is_ndarray()] + input_expr = '\n'.join( + [(('const {0} {1}' if p.is_const else '{0}& {1}') + + ' = _raw_{1}[_in_ind.get()];').format(p.ctype, p.name) + for p in in_arrays]) + output_expr = '\n'.join( + ['{0} &{1} = _raw_{1}[_out_ind.get()];'.format(p.ctype, p.name) + for p in out_arrays if not p.is_const]) + + return _create_reduction_function_code( + name, block_size, reduce_type, params, arginfos, identity, + map_expr, reduce_expr, 
post_map_expr, + type_map, input_expr, output_expr, preamble, options) + + +@_util.memoize(for_each_device=True) +def _ReductionKernel_get_cached_function( + nin, nout, params, arginfos, _kernel._TypeMap type_map, + name, block_size, reduce_type, identity, map_expr, reduce_expr, + post_map_expr, preamble, options): + code = _ReductionKernel_get_cached_function_code( + nin, nout, params, arginfos, type_map, + name, block_size, reduce_type, identity, map_expr, reduce_expr, + post_map_expr, preamble, options) + return _create_reduction_function_from_code(name, code, options) diff --git a/cupy/_core/_routines_binary.pxd b/cupy/_core/_routines_binary.pxd new file mode 100644 index 0000000..140f2a3 --- /dev/null +++ b/cupy/_core/_routines_binary.pxd @@ -0,0 +1,6 @@ +cdef object _bitwise_and +cdef object _bitwise_or +cdef object _bitwise_xor +cdef object _invert +cdef object _left_shift +cdef object _right_shift diff --git a/cupy/_core/_routines_binary.pyx b/cupy/_core/_routines_binary.pyx new file mode 100644 index 0000000..e069630 --- /dev/null +++ b/cupy/_core/_routines_binary.pyx @@ -0,0 +1,96 @@ +from cupy._core._kernel import create_ufunc + + +cdef _create_bit_op(name, op, no_bool, doc='', scatter_op=None): + types = () if no_bool else ('??->?',) + return create_ufunc( + 'cupy_' + name, + types + ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', + 'LL->L', 'qq->q', 'QQ->Q'), + 'out0 = in0 %s in1' % op, + doc=doc, scatter_op=scatter_op) + + +cdef _bitwise_and = _create_bit_op( + 'bitwise_and', '&', False, + '''Computes the bitwise AND of two arrays elementwise. + + Only integer and boolean arrays are handled. + + .. seealso:: :data:`numpy.bitwise_and` + + ''', + scatter_op='and') + + +cdef _bitwise_or = _create_bit_op( + 'bitwise_or', '|', False, + '''Computes the bitwise OR of two arrays elementwise. + + Only integer and boolean arrays are handled. + + .. seealso:: :data:`numpy.bitwise_or` + + ''', + scatter_op='or') + + +cdef _bitwise_xor = _create_bit_op( + 'bitwise_xor', '^', False, + '''Computes the bitwise XOR of two arrays elementwise. + + Only integer and boolean arrays are handled. + + .. seealso:: :data:`numpy.bitwise_xor` + + ''', + scatter_op='xor') + + +cdef _invert = create_ufunc( + 'cupy_invert', + (('?->?', 'out0 = !in0'), 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', + 'l->l', 'L->L', 'q->q', 'Q->Q'), + 'out0 = ~in0', + doc='''Computes the bitwise NOT of an array elementwise. + + Only integer and boolean arrays are handled. + + .. note:: + :func:`cupy.bitwise_not` is an alias for :func:`cupy.invert`. + + .. seealso:: :data:`numpy.invert` + + ''') + + +cdef _left_shift = _create_bit_op( + 'left_shift', '<<', True, + '''Shifts the bits of each integer element to the left. + + Only integer arrays are handled. + + .. seealso:: :data:`numpy.left_shift` + + ''') + + +cdef _right_shift = _create_bit_op( + 'right_shift', '>>', True, + '''Shifts the bits of each integer element to the right. + + Only integer arrays are handled + + .. seealso:: :data:`numpy.right_shift` + + ''') + + +# Variables to expose to Python +# (cythonized data cannot be exposed to Python, even with cpdef.) 
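+# These are re-exported as the public ufuncs (e.g. ``cupy.bitwise_and``).
+# An illustrative call with arbitrary values:
+#     cupy.bitwise_and(cupy.array([0b1100]), cupy.array([0b1010]))
+# returns ``array([8])`` (i.e. 0b1000).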
+bitwise_and = _bitwise_and +bitwise_or = _bitwise_or +bitwise_xor = _bitwise_xor +invert = _invert +left_shift = _left_shift +right_shift = _right_shift diff --git a/cupy/_core/_routines_indexing.pxd b/cupy/_core/_routines_indexing.pxd new file mode 100644 index 0000000..1fb06cd --- /dev/null +++ b/cupy/_core/_routines_indexing.pxd @@ -0,0 +1,15 @@ +from cupy._core.core cimport _ndarray_base + + +cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self) +cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices) +cdef _ndarray_setitem(_ndarray_base self, slices, value) +cdef tuple _ndarray_nonzero(_ndarray_base self) +cdef _scatter_op(_ndarray_base a, slices, value, op) +cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out) +cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode) +cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode) +cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out) +cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2) +cdef _ndarray_base _add_reduceat( + _ndarray_base array, indices, axis, dtype, out) diff --git a/cupy/_core/_routines_indexing.pyx b/cupy/_core/_routines_indexing.pyx new file mode 100644 index 0000000..e399ef9 --- /dev/null +++ b/cupy/_core/_routines_indexing.pyx @@ -0,0 +1,1161 @@ +# distutils: language = c++ +import warnings +import string + +import numpy + +import cupy +import cupy._core.core as core +from cupy._core._kernel import ElementwiseKernel, _get_warpsize +from cupy._core._ufuncs import elementwise_copy + +from libcpp cimport vector + +from cupy._core._carray cimport shape_t +from cupy._core._carray cimport strides_t +from cupy._core cimport core +from cupy._core cimport _routines_math as _math +from cupy._core cimport _routines_manipulation as _manipulation +from cupy._core.core cimport _ndarray_base +from cupy._core cimport internal + + +# _ndarray_base members + + +cdef _ndarray_base _ndarray_getitem(_ndarray_base self, slices): + cdef Py_ssize_t axis + cdef list slice_list + cdef _ndarray_base a + + slice_list = _prepare_slice_list(slices) + a, adv = _view_getitem(self, slice_list) + if adv is None: + return a + + axis = adv + if len(slice_list) == 1: + s = slice_list[0] + if s.dtype.kind == 'b': + return _getitem_mask_single(a, s, axis) + else: + return a.take(s, axis) + + return _getitem_multiple(a, axis, slice_list) + + +cdef _ndarray_setitem(_ndarray_base self, slices, value): + if isinstance(value, _ndarray_base): + value = _squeeze_leading_unit_dims(value) + _scatter_op(self, slices, value, 'update') + + +cdef tuple _ndarray_nonzero(_ndarray_base self): + cdef int ndim + cdef _ndarray_base dst = _ndarray_argwhere(self) + ndim = self.ndim + if ndim >= 1: + return tuple([dst[:, i] for i in range(ndim)]) + else: + warnings.warn( + 'calling nonzero on 0d arrays is deprecated', + DeprecationWarning) + return cupy.zeros(dst.shape[0], numpy.int64), + + +# TODO(kataoka): Rename the function because `_ndarray_base` does not have +# `argwhere` method +cpdef _ndarray_base _ndarray_argwhere(_ndarray_base self): + cdef Py_ssize_t count_nonzero + cdef int ndim + cdef _ndarray_base nonzero + numpy_int64 = numpy.int64 + if self.size == 0: + count_nonzero = 0 + else: + if self.dtype == numpy.bool_: + nonzero = self.ravel() + else: + nonzero = cupy._core.not_equal(self, 0) + nonzero = nonzero.ravel() + + # Get number of True in the mask to determine the shape of the array + # after masking. 
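+        # An int32 scan is sufficient while every flat index fits in
+        # 32 bits; int64 is needed only beyond 2**31 - 1 elements.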
+ if nonzero.size <= 2 ** 31 - 1: + scan_dtype = numpy.int32 + else: + scan_dtype = numpy_int64 + + chunk_size = 512 + + # TODO(anaruse): Use Optuna to automatically tune the threshold + # that determines whether "incomplete scan" is enabled or not. + # Basically, "incomplete scan" is fast when the array size is large, + # but for small arrays, it is better to use the normal method. + incomplete_scan = nonzero.size > chunk_size + + scan_index = _math.scan( + nonzero, op=_math.scan_op.SCAN_SUM, dtype=scan_dtype, out=None, + incomplete=incomplete_scan, chunk_size=chunk_size) + count_nonzero = int(scan_index[-1]) # synchronize! + + ndim = self._shape.size() + dst = core.ndarray((count_nonzero, ndim), dtype=numpy_int64) + if dst.size == 0: + return dst + + nonzero.shape = self.shape + if incomplete_scan: + warp_size = _get_warpsize() + size = scan_index.size * chunk_size + _nonzero_kernel_incomplete_scan(chunk_size, warp_size)( + nonzero, scan_index, dst, + size=size, block_size=chunk_size) + else: + scan_index.shape = self.shape + _nonzero_kernel(nonzero, scan_index, dst) + + return dst + + +cdef _ndarray_base _ndarray_take(_ndarray_base self, indices, axis, out): + cdef Py_ssize_t ndim = self._shape.size() + if axis is None: + return _take(self, indices, 0, ndim, out) + elif ndim == 0: + # check axis after atleast_1d + internal._normalize_axis_index(axis, 1) + return _take(self, indices, 0, 0, out) + else: + axis = internal._normalize_axis_index(axis, ndim) + return _take(self, indices, axis, axis + 1, out) + + +cdef _ndarray_base _ndarray_put(_ndarray_base self, indices, values, mode): + if mode not in ('raise', 'wrap', 'clip'): + raise ValueError('clipmode not understood') + + n = self.size + if not isinstance(indices, _ndarray_base): + indices = core.array(indices) + indices = indices.ravel() + + if not isinstance(values, _ndarray_base): + values = core.array(values, dtype=self.dtype) + if values.size == 0: + return + + if mode == 'raise': + err = cupy.zeros((), dtype=numpy.bool_) + _put_raise_kernel(indices, values, values.size, n, self, err) + if err: + raise IndexError('invalid entry in indices array') + elif mode == 'wrap': + _put_wrap_kernel(indices, values, values.size, n, self) + elif mode == 'clip': + _put_clip_kernel(indices, values, values.size, n, self) + + +cdef _ndarray_base _ndarray_choose(_ndarray_base self, choices, out, mode): + a = self + n = choices.shape[0] + + # broadcast `a` and `choices[i]` for all i + if a.ndim < choices.ndim - 1: + for i in range(choices.ndim - 1 - a.ndim): + a = a[None, ...] + elif a.ndim > choices.ndim - 1: + for i in range(a.ndim + 1 - choices.ndim): + choices = choices[:, None, ...] 
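+    # After the padding above, `a` and `choices` broadcast to a common
+    # shape whose leading axis (of length n) enumerates the choices.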
+ ba, bcs = _manipulation.broadcast(a, choices).values + + if out is None: + out = core.ndarray(ba.shape[1:], choices.dtype) + + n_channel = numpy.prod(bcs[0].shape) + if mode == 'raise': + if not ((a < n).all() and (0 <= a).all()): + raise ValueError('invalid entry in choice array') + _choose_kernel(ba[0], bcs, n_channel, out) + elif mode == 'wrap': + ba = ba[0] % n + _choose_kernel(ba, bcs, n_channel, out) + elif mode == 'clip': + _choose_clip_kernel(ba[0], bcs, n_channel, n, out) + else: + raise ValueError('clipmode not understood') + + return out + + +cdef _ndarray_base _ndarray_compress(_ndarray_base self, condition, axis, out): + a = self + + if numpy.isscalar(condition): + raise ValueError('condition must be a 1-d array') + + if not isinstance(condition, _ndarray_base): + condition = core.array(condition, dtype=int) + if condition.ndim != 1: + raise ValueError('condition must be a 1-d array') + + # do not test condition.shape + res = _ndarray_nonzero(condition) # synchronize + + # the `take` method/function also make the input atleast_1d + return _ndarray_take(a, res[0], axis, out) + + +cdef _ndarray_base _ndarray_diagonal(_ndarray_base self, offset, axis1, axis2): + return _diagonal(self, offset, axis1, axis2) + + +# private/internal + + +cdef _ndarray_base _squeeze_leading_unit_dims(_ndarray_base src): + # remove leading 1s from the shape greedily. + # TODO(kataoka): compute requested ndim and do not remove too much for + # printing correct shape in error message. + cdef Py_ssize_t i + for i in range(src.ndim): + if src._shape[i] != 1: + break + else: + i = src.ndim + + if i == 0: + return src + + src = src.view() + # del src._shape[:i] + # del src._strides[:i] + src._shape.erase(src._shape.begin(), src._shape.begin()+i) + src._strides.erase(src._strides.begin(), src._strides.begin()+i) + return src + + +cpdef list _prepare_slice_list(slices): + cdef Py_ssize_t i + cdef list slice_list + cdef bint fix_empty_dtype + + if isinstance(slices, tuple): + slice_list = list(slices) + else: + slice_list = [slices] + + # Convert list/NumPy/CUDA-Array-Interface arrays to cupy.ndarray. + # - Scalar int in indices returns a view. + # - Other array-like (including ()-shaped array) in indices forces to + # return a new array. + for i, s in enumerate(slice_list): + if s is None or s is Ellipsis or isinstance(s, (slice, _ndarray_base)): + continue + + fix_empty_dtype = False + if isinstance(s, (list, tuple)): + # This condition looks inaccurate, but so is NumPy. + # a[1, [np.empty(0, float)]] is allowed, while + # a[1, np.empty((1, 0), float)] raises IndexError. + fix_empty_dtype = True + elif numpy.isscalar(s): + if not isinstance(s, (bool, numpy.bool_)): + # keep scalar int + continue + + if cupy.min_scalar_type(s).char == 'O': + raise IndexError( + 'arrays used as indices must be of integer (or boolean) type') + try: + s = core.array(s, dtype=None, copy=False) + except ValueError: + # "Unsupported dtype" + raise IndexError( + 'only integers, slices (`:`), ellipsis (`...`),' + 'numpy.newaxis (`None`) and integer or ' + 'boolean arrays are valid indices') + if fix_empty_dtype and s.size == 0: + # An empty list means empty indices, not empty mask. + # Fix default dtype (float64). + s = s.astype(numpy.int32) + slice_list[i] = s + + return slice_list + + +cdef tuple _view_getitem(_ndarray_base a, list slice_list): + # Process scalar/slice/ellipsis indices + # Returns a 2-tuple + # - [0] (ndarray): view of a + # - [1] (int or None): start axis for remaining indices + # slice_list will be overwritten. 
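+    # For example (illustrative): for a[1:, None, idx] the slice and
+    # newaxis are applied to the returned view here, and only the array
+    # index `idx` is left in slice_list.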
+ # input should contain: + # None, Ellipsis, slice (start:stop:step), scalar int, or + # cupy.ndarray + # output will contain: + # cupy.ndarray + cdef shape_t shape + cdef strides_t strides + cdef _ndarray_base v + cdef Py_ssize_t ndim_a, axis_a, ndim_v, axis_v, ndim_ellipsis + cdef Py_ssize_t i, k, offset + cdef Py_ssize_t s_start, s_stop, s_step, dim, ind + cdef slice ss + cdef list index_list, axes + cdef vector.vector[bint] array_like_flags + cdef vector.vector[Py_ssize_t] array_ndims + cdef bint has_ellipsis, flag + cdef char kind + + axis_a = 0 + has_ellipsis = False + for s in slice_list: + if s is None: + continue + elif s is Ellipsis: + if has_ellipsis: + raise IndexError( + "an index can only have a single ellipsis ('...')") + has_ellipsis = True + elif isinstance(s, _ndarray_base): + kind = ord(s.dtype.kind) + if kind == b'b': + k = s.ndim + elif kind == b'i' or kind == b'u': + k = 1 + else: + raise IndexError( + 'arrays used as indices must be of integer or boolean ' + 'type. (actual: {})'.format(s.dtype.type)) + array_ndims.push_back(k) + axis_a += k + else: + # isinstance(s, slice) or numpy.isscalar(s) + axis_a += 1 + if not has_ellipsis: + slice_list.append(Ellipsis) + + ndim_a = a._shape.size() + if axis_a > ndim_a: + raise IndexError( + 'too many indices for array: ' + f'array is {ndim_a}-dimensional, but {axis_a} were indexed') + ndim_ellipsis = ndim_a - axis_a + + # Create new shape and stride + i = 0 + axis_a = 0 + axis_v = 0 + offset = 0 + # index_list: remaining indices to be processed. + # Each elem is a 3-tuple (array, axis_start, axis_count) + index_list = [] + for s in slice_list: + if s is None: + shape.push_back(1) + strides.push_back(0) + axis_v += 1 + array_like_flags.push_back(False) + elif isinstance(s, _ndarray_base): + k = array_ndims[i] + index_list.append((s, axis_v, k)) + i += 1 + kind = ord(s.dtype.kind) + if kind == b'b': + _check_mask_shape(a, s, axis_a) + for _ in range(k): + shape.push_back(a._shape[axis_a]) + strides.push_back(a._strides[axis_a]) + axis_a += 1 + axis_v += k + array_like_flags.push_back(True) + elif s is Ellipsis: + for _ in range(ndim_ellipsis): + shape.push_back(a._shape[axis_a]) + strides.push_back(a._strides[axis_a]) + axis_a += 1 + axis_v += ndim_ellipsis + array_like_flags.push_back(False) + elif isinstance(s, slice): + ss = internal.complete_slice(s, a._shape[axis_a]) + s_start = ss.start + s_stop = ss.stop + s_step = ss.step + if s_step > 0: + dim = (s_stop - s_start - 1) // s_step + 1 + else: + dim = (s_stop - s_start + 1) // s_step + 1 + + if dim == 0: + strides.push_back(a._strides[axis_a]) + else: + strides.push_back(a._strides[axis_a] * s_step) + + if s_start > 0: + offset += a._strides[axis_a] * s_start + shape.push_back(dim) + axis_a += 1 + axis_v += 1 + array_like_flags.push_back(False) + else: + # numpy.isscalar(s) + ind = int(s) + if ind < 0: + ind += a._shape[axis_a] + if not (0 <= ind < a._shape[axis_a]): + msg = ('Index %s is out of bounds for axis %s with ' + 'size %s' % (s, axis_a, a._shape[axis_a])) + raise IndexError(msg) + offset += ind * a._strides[axis_a] + axis_a += 1 + # array-like but not array + array_like_flags.push_back(True) + + ndim_v = axis_v + v = a.view() + if a.size != 0: + v.data = a.data + offset + v._set_shape_and_strides(shape, strides, True, True) + + if array_ndims.empty(): + # no advanced indexing. no mask. 
+ del slice_list[:] + return v, None + + slice_list[:] = [s for s, _, _ in index_list] + + # non-consecutive array-like indices => batch dims go first in output + # consecutive array-like indices => start batch dims there + k = 0 + for i, flag in enumerate(array_like_flags): + if k == 0: + if flag: + k = 1 + elif k == 1: + if not flag: + k = 2 + else: # k == 2 + if flag: + break + else: + return v, index_list[0][1] + + # compute transpose arg + axes = [] + for _, axis_v, k in index_list: + for _ in range(k): + axes.append(axis_v) + axis_v += 1 + axes.extend([dim for dim in range(ndim_v) if dim not in axes]) + v = _manipulation._transpose(v, axes) + return v, 0 + + +@cupy._util.memoize(for_each_device=True) +def _nonzero_kernel_incomplete_scan(block_size, warp_size=32): + in_params = 'raw T a, raw S b' + out_params = 'raw O dst' + loop_prep = string.Template(""" + __shared__ S smem[${warp_size}]; + const int n_warp = ${block_size} / ${warp_size}; + const int warp_id = threadIdx.x / ${warp_size}; + const int lane_id = threadIdx.x % ${warp_size}; + """).substitute(block_size=block_size, warp_size=warp_size) + loop_body = string.Template(""" + S x = 0; + if (i < a.size()) x = a[i]; + for (int j = 1; j < ${warp_size}; j *= 2) { + S tmp = __shfl_up_sync(0xffffffff, x, j, ${warp_size}); + if (lane_id - j >= 0) x += tmp; + } + if (lane_id == ${warp_size} - 1) smem[warp_id] = x; + __syncthreads(); + if (warp_id == 0) { + S y = 0; + if (lane_id < n_warp) y = smem[lane_id]; + for (int j = 1; j < n_warp; j *= 2) { + S tmp = __shfl_up_sync(0xffffffff, y, j, ${warp_size}); + if (lane_id - j >= 0) y += tmp; + } + int block_id = i / ${block_size}; + S base = 0; + if (block_id > 0) base = b[block_id - 1]; + if (lane_id == ${warp_size} - 1) y = 0; + smem[(lane_id + 1) % ${warp_size}] = y + base; + } + __syncthreads(); + x += smem[warp_id]; + S x0 = __shfl_up_sync(0xffffffff, x, 1, ${warp_size}); + if (lane_id == 0) { + x0 = smem[warp_id]; + } + if (x0 < x && i < a.size()) { + O j = i; + for (int d = a.ndim - 1; d >= 0; d--) { + ptrdiff_t ind[] = {x0, d}; + O j_next = j / a.shape()[d]; + dst[ind] = j - j_next * a.shape()[d]; + j = j_next; + } + } + """).substitute(block_size=block_size, warp_size=warp_size) + return cupy.ElementwiseKernel(in_params, out_params, loop_body, + 'cupy_nonzero_kernel_incomplete_scan', + loop_prep=loop_prep) + + +_nonzero_kernel = ElementwiseKernel( + 'T src, S index', 'raw U dst', + ''' + if (src != 0){ + for(int j = 0; j < _ind.ndim; j++){ + ptrdiff_t ind[] = {index - 1, j}; + dst[ind] = _ind.get()[j]; + } + }''', + 'cupy_nonzero_kernel', + reduce_dims=False) + + +_take_kernel_core = ''' +ptrdiff_t out_i = indices % index_range; +if (out_i < 0) out_i += index_range; +if (ldim != 1) out_i += (i / (cdim * rdim)) * index_range; +if (rdim != 1) out_i = out_i * rdim + i % rdim; +out = a[out_i]; +''' + + +_take_kernel = ElementwiseKernel( + 'raw T a, S indices, uint32 ldim, uint32 cdim, uint32 rdim, ' + 'int64 index_range', + 'T out', _take_kernel_core, 'cupy_take') + + +_take_kernel_scalar = ElementwiseKernel( + 'raw T a, int64 indices, uint32 ldim, uint32 cdim, uint32 rdim, ' + 'int64 index_range', + 'T out', _take_kernel_core, 'cupy_take_scalar') + + +_choose_kernel = ElementwiseKernel( + 'S a, raw T choices, int32 n_channel', + 'T y', + 'y = choices[i + n_channel * a]', + 'cupy_choose') + + +_choose_clip_kernel = ElementwiseKernel( + 'S a, raw T choices, int32 n_channel, int32 n', + 'T y', + ''' + S x = a; + if (a < 0) { + x = 0; + } else if (a >= n) { + x = n - 1; + } + y = 
choices[i + n_channel * x]; + ''', + 'cupy_choose_clip') + + +cdef _put_raise_kernel = ElementwiseKernel( + 'S ind, raw T vals, int64 n_vals, int64 n', + 'raw U data, raw bool err', + ''' + ptrdiff_t ind_ = ind; + if (!(-n <= ind_ && ind_ < n)) { + err[0] = 1; + } else { + if (ind_ < 0) ind_ += n; + data[ind_] = (U)(vals[i % n_vals]); + } + ''', + 'cupy_put_raise') + + +cdef _put_wrap_kernel = ElementwiseKernel( + 'S ind, raw T vals, int64 n_vals, int64 n', + 'raw U data', + ''' + ptrdiff_t ind_ = ind; + ind_ %= n; + if (ind_ < 0) ind_ += n; + data[ind_] = (U)(vals[i % n_vals]); + ''', + 'cupy_put_wrap') + + +cdef _put_clip_kernel = ElementwiseKernel( + 'S ind, raw T vals, int64 n_vals, int64 n', + 'raw U data', + ''' + ptrdiff_t ind_ = ind; + if (ind_ < 0) { + ind_ = 0; + } else if (ind_ >= n) { + ind_ = n - 1; + } + data[ind_] = (U)(vals[i % n_vals]); + ''', + 'cupy_put_clip') + + +cdef _create_scatter_kernel(name, code): + return ElementwiseKernel( + 'T v, S indices, int32 cdim, int32 rdim, int32 adim', + 'raw T a', + string.Template(''' + S wrap_indices = indices % adim; + if (wrap_indices < 0) wrap_indices += adim; + ptrdiff_t li = i / (rdim * cdim); + ptrdiff_t ri = i % rdim; + T &out0 = a[(li * adim + wrap_indices) * rdim + ri]; + T &in0 = out0; + const T &in1 = v; + ${code}; + ''').substitute(code=code), + name, + ) + + +cdef _scatter_update_kernel = _create_scatter_kernel( + 'cupy_scatter_update', 'out0 = in1') + +cdef _scatter_add_kernel = _create_scatter_kernel( + 'cupy_scatter_add', 'atomicAdd(&out0, in1)') + +cdef _scatter_sub_kernel = _create_scatter_kernel( + 'cupy_scatter_sub', 'atomicSub(&out0, in1)') + +cdef _scatter_max_kernel = _create_scatter_kernel( + 'cupy_scatter_max', 'atomicMax(&out0, in1)') + +cdef _scatter_min_kernel = _create_scatter_kernel( + 'cupy_scatter_min', 'atomicMin(&out0, in1)') + +cdef _scatter_and_kernel = _create_scatter_kernel( + 'cupy_scatter_and', 'atomicAnd(&out0, in1)') + +cdef _scatter_or_kernel = _create_scatter_kernel( + 'cupy_scatter_or', 'atomicOr(&out0, in1)') + +cdef _scatter_xor_kernel = _create_scatter_kernel( + 'cupy_scatter_xor', 'atomicXor(&out0, in1)') + + +cdef _create_scatter_mask_kernel(name, code): + return ElementwiseKernel( + 'raw T v, bool mask, S mask_scanned', + 'T a', + string.Template(''' + T &out0 = a; + T &in0 = a; + const T &in1 = v[mask_scanned - 1]; + if (mask) ${code}; + ''').substitute(code=code), + name, + ) + + +cdef _scatter_update_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_update_mask', 'out0 = in1') + +cdef _scatter_add_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_add_mask', 'out0 = in0 + in1') + +cdef _scatter_sub_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_sub_mask', 'out0 = in0 - in1') + +cdef _scatter_max_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_max_mask', 'out0 = max(in0, in1)') + +cdef _scatter_min_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_min_mask', 'out0 = min(in0, in1)') + +cdef _scatter_and_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_and_mask', 'out0 = (in0 & in1)') + +cdef _scatter_or_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_or_mask', 'out0 = (in0 | in1)') + +cdef _scatter_xor_mask_kernel = _create_scatter_mask_kernel( + 'cupy_scatter_xor_mask', 'out0 = (in0 ^ in1)') + + +_getitem_mask_kernel = ElementwiseKernel( + 'T a, bool mask, S mask_scanned', + 'raw T out', + 'if (mask) out[mask_scanned - 1] = a', + 'cupy_getitem_mask') + + +cdef _check_mask_shape(_ndarray_base a, _ndarray_base
mask, Py_ssize_t axis): + cdef Py_ssize_t i, a_sh, m_sh + for i, m_sh in enumerate(mask._shape): + a_sh = a._shape[axis + i] + if m_sh not in (0, a_sh): + raise IndexError( + 'boolean index did not match indexed array along dimension ' + f'{axis + i}; dimension is {a_sh} ' + f'but corresponding boolean dimension is {m_sh}' + ) + + +cpdef _prepare_mask_indexing_single( + _ndarray_base a, _ndarray_base mask, Py_ssize_t axis): + cdef _ndarray_base mask_scanned, mask_br + cdef int n_true + cdef tuple lshape, rshape, a_shape + cdef Py_ssize_t a_ndim, mask_ndim + + a_ndim = a._shape.size() + mask_ndim = mask._shape.size() + a_shape = a.shape + lshape = a_shape[:axis] + rshape = a_shape[axis + mask._shape.size():] + + if mask.size == 0: + masked_shape = lshape + (0,) + rshape + mask_br = _manipulation._reshape(mask, masked_shape) + return mask_br, mask_br, masked_shape + + # Get number of True in the mask to determine the shape of the array + # after masking. + if mask.size <= 2 ** 31 - 1: + mask_type = numpy.int32 + else: + mask_type = numpy.int64 + op = _math.scan_op.SCAN_SUM + + # starts with 1 + mask_scanned = _math.scan(mask.ravel(), op=op, dtype=mask_type) + n_true = int(mask_scanned[-1]) + masked_shape = lshape + (n_true,) + rshape + # When mask covers the entire array, broadcasting is not necessary. + if mask_ndim == a_ndim and axis == 0: + return ( + mask, + _manipulation._reshape(mask_scanned, mask._shape), + masked_shape) + mask_scanned = None + + # The scan of the broadcasted array is used to index on kernel. + mask = _manipulation._reshape( + mask, + axis * (1,) + mask.shape + (a_ndim - axis - mask_ndim) * (1,)) + if mask._shape.size() > a_ndim: + raise IndexError('too many indices for array') + + mask = _manipulation.broadcast_to(mask, a_shape) + if mask.size <= 2 ** 31 - 1: + mask_type = numpy.int32 + else: + mask_type = numpy.int64 + mask_scanned = _manipulation._reshape( + _math.scan(mask.ravel(), op=_math.scan_op.SCAN_SUM, dtype=mask_type), + mask._shape) + return mask, mask_scanned, masked_shape + + +cpdef _ndarray_base _getitem_mask_single( + _ndarray_base a, _ndarray_base mask, int axis): + cdef _ndarray_base mask_scanned + cdef tuple masked_shape + + mask, mask_scanned, masked_shape = _prepare_mask_indexing_single( + a, mask, axis) + out = core.ndarray(masked_shape, dtype=a.dtype) + if out.size == 0: + return out + return _getitem_mask_kernel(a, mask, mask_scanned, out) + + +cdef _ndarray_base _take( + _ndarray_base a, indices, int start, int stop, _ndarray_base out=None): + # Take along (flattened) axes from start to stop. 
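+    # (Axes in [start, stop) are flattened into a single axis to take
+    # along.)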
+ # When start + 1 == stop this function behaves similarly to np.take + cdef tuple out_shape, indices_shape + cdef int i, ndim = a._shape.size() + cdef Py_ssize_t ldim, cdim, rdim, index_range + + assert start <= stop + + if numpy.isscalar(indices): + indices_shape = () + cdim = 1 + else: + if not isinstance(indices, _ndarray_base): + indices = core.array(indices, dtype=int) + indices_shape = indices.shape + cdim = indices.size + + ldim = rdim = 1 + if start == 0 and stop == ndim: + out_shape = indices_shape + index_range = a.size + else: + a_shape = a.shape + out_shape = a_shape[:start] + indices_shape + a_shape[stop:] + if len(indices_shape) != 0: + indices = _manipulation._reshape( + indices, + (1,) * start + indices_shape + (1,) * (ndim - stop)) + for i in range(start): + ldim *= a._shape[i] + for i in range(stop, ndim): + rdim *= a._shape[i] + index_range = 1 + for i in range(start, stop): + index_range *= a._shape[i] + + if out is None: + out = core.ndarray(out_shape, dtype=a.dtype) + else: + if out.dtype != a.dtype: + raise TypeError('Output dtype mismatch') + if out.shape != out_shape: + raise ValueError('Output shape mismatch') + if a.size == 0 and out.size != 0: + raise IndexError('cannot do a non-empty take from an empty axes.') + + if isinstance(indices, _ndarray_base): + return _take_kernel( + a.reduced_view(), indices, ldim, cdim, rdim, index_range, out) + else: + return _take_kernel_scalar( + a.reduced_view(), indices, ldim, cdim, rdim, index_range, out) + + cdef _scatter_op_single( + _ndarray_base a, _ndarray_base indices, value, Py_ssize_t start, + Py_ssize_t stop, op=''): + # When op == 'update', this function behaves similarly to + # the code below using NumPy under the condition that a = a._reshape(shape) + # does not invoke copy. + # + # shape = a[:start] +\ + # (numpy.prod(a[start:stop]),) + a[stop:] + # a = a._reshape(shape) + # slices = (slice(None),) * start + indices +\ + # (slice(None),) * (a.ndim - stop) + # a[slices] = value + cdef Py_ssize_t adim, cdim, rdim + cdef tuple a_shape, indices_shape, lshape, rshape, v_shape + cdef _ndarray_base v + + if not isinstance(value, _ndarray_base): + v = core.array(value, dtype=a.dtype) + else: + v = value.astype(a.dtype, copy=False) + + a_shape = a.shape + + lshape = a_shape[:start] + rshape = a_shape[stop:] + adim = internal.prod_sequence(a_shape[start:stop]) + + indices_shape = indices.shape + v_shape = lshape + indices_shape + rshape + v = _manipulation.broadcast_to(v, v_shape) + + cdim = indices.size + rdim = internal.prod_sequence(rshape) + indices = _manipulation._reshape( + indices, + (1,) * len(lshape) + indices_shape + (1,) * len(rshape)) + indices = _manipulation.broadcast_to(indices, v_shape) + + if op == 'update': + _scatter_update_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + elif op == 'add': + # There are constraints on types because atomicAdd() in CUDA 7.5 + # only supports int32, uint32, uint64, and float32.
+ if not issubclass(v.dtype.type, + (numpy.int32, numpy.float16, numpy.float32, + numpy.float64, numpy.uint32, numpy.uint64, + numpy.intc, numpy.uintc, numpy.ulonglong)): + raise TypeError( + 'cupy.add.at only supports int32, float16, float32, float64, ' + 'uint32, uint64, as data type') + _scatter_add_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + elif op == 'sub': + if not issubclass(v.dtype.type, + (numpy.int32, numpy.uint32, + numpy.intc, numpy.uintc)): + raise TypeError( + 'cupy.subtract.at only supports int32, uint32, as data type') + _scatter_sub_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + elif op == 'max': + if not issubclass(v.dtype.type, + (numpy.int32, numpy.float32, numpy.float64, + numpy.uint32, numpy.uint64, + numpy.intc, numpy.uintc, numpy.ulonglong)): + raise TypeError( + 'cupy.maximum.at only supports int32, float32, float64, ' + 'uint32, uint64 as data type') + _scatter_max_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + elif op == 'min': + if not issubclass(v.dtype.type, + (numpy.int32, numpy.float32, numpy.float64, + numpy.uint32, numpy.uint64, + numpy.intc, numpy.uintc, numpy.ulonglong)): + raise TypeError( + 'cupy.minimum.at only supports int32, float32, float64, ' + 'uint32, uint64 as data type') + _scatter_min_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + elif op == 'and': + if not issubclass(v.dtype.type, + (numpy.int32, numpy.int64, + numpy.uint32, numpy.uint64, + numpy.intc, numpy.uintc, + numpy.longlong, numpy.ulonglong)): + raise TypeError( + 'cupy.bitwise_and.at only supports int32, int64, ' + 'uint32, uint64 as data type') + _scatter_and_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + elif op == 'or': + if not issubclass(v.dtype.type, + (numpy.int32, numpy.int64, + numpy.uint32, numpy.uint64, + numpy.intc, numpy.uintc, + numpy.longlong, numpy.ulonglong)): + raise TypeError( + 'cupy.bitwise_or.at only supports int32, int64, ' + 'uint32, uint64 as data type') + _scatter_or_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + elif op == 'xor': + if not issubclass(v.dtype.type, + (numpy.int32, numpy.int64, + numpy.uint32, numpy.uint64, + numpy.intc, numpy.uintc, + numpy.longlong, numpy.ulonglong)): + raise TypeError( + 'cupy.bitwise_xor.at only supports int32, int64, ' + 'uint32, uint64 as data type') + _scatter_xor_kernel( + v, indices, cdim, rdim, adim, a.reduced_view()) + else: + raise ValueError('provided op is not supported') + + +cdef _scatter_op_mask_single( + _ndarray_base a, _ndarray_base mask, v, Py_ssize_t axis, op): + cdef _ndarray_base mask_scanned, src + cdef tuple masked_shape + + mask, mask_scanned, masked_shape = _prepare_mask_indexing_single( + a, mask, axis) + if internal.prod(masked_shape) == 0: + return + + if not isinstance(v, _ndarray_base): + src = core.array(v, dtype=a.dtype) + else: + src = v + # Cython's static resolution does not work because of omitted arguments + src = (src).astype(a.dtype, copy=False) + # broadcast src to shape determined by the mask + src = _manipulation.broadcast_to(src, masked_shape) + + if op == 'update': + _scatter_update_mask_kernel(src, mask, mask_scanned, a) + elif op == 'add': + _scatter_add_mask_kernel(src, mask, mask_scanned, a) + elif op == 'sub': + _scatter_sub_mask_kernel(src, mask, mask_scanned, a) + elif op == 'max': + _scatter_max_mask_kernel(src, mask, mask_scanned, a) + elif op == 'min': + _scatter_min_mask_kernel(src, mask, mask_scanned, a) + elif op == 'and': + _scatter_and_mask_kernel(src, mask, mask_scanned, a) + elif op 
== 'or': + _scatter_or_mask_kernel(src, mask, mask_scanned, a) + elif op == 'xor': + _scatter_xor_mask_kernel(src, mask, mask_scanned, a) + else: + raise ValueError('provided op is not supported') + + +cdef _scatter_op(_ndarray_base a, slices, value, op): + cdef Py_ssize_t start, stop, axis + cdef _ndarray_base x, y, reduced_idx + cdef list slice_list + + slice_list = _prepare_slice_list(slices) + a, adv = _view_getitem(a, slice_list) + if adv is not None: + axis = adv + if len(slice_list) == 1: + s = slice_list[0] + if s.dtype.kind == 'b': + _scatter_op_mask_single(a, s, value, axis, op) + else: + _scatter_op_single(a, s, value, axis, axis + 1, op) + else: + # scatter_op with multiple integer arrays + reduced_idx, start, stop = _prepare_multiple_array_indexing( + a, axis, slice_list) + _scatter_op_single(a, reduced_idx, value, start, stop, op) + return + + y = a + + if op == 'update': + if not isinstance(value, _ndarray_base): + y.fill(value) + return + x = value + if (internal.vector_equal(y._shape, x._shape) and + internal.vector_equal(y._strides, x._strides)): + if y.data.ptr == x.data.ptr: + return # Skip since x and y are the same array + elif y._c_contiguous and x.dtype == y.dtype: + y.data.copy_from_device_async(x.data, x.nbytes) + return + elementwise_copy(x, y) + return + if op == 'add': + _math._add(y, value, y) + return + if op == 'sub': + _math._subtract(y, value, y) + return + if op == 'max': + cupy.maximum(y, value, y) + return + if op == 'min': + cupy.minimum(y, value, y) + return + if op == 'and': + cupy.bitwise_and(y, value, y) + return + if op == 'or': + cupy.bitwise_or(y, value, y) + return + if op == 'xor': + cupy.bitwise_xor(y, value, y) + return + raise ValueError('this op is not supported') + + +cdef _ndarray_base _diagonal( + _ndarray_base a, Py_ssize_t offset=0, Py_ssize_t axis1=0, + Py_ssize_t axis2=1): + cdef Py_ssize_t ndim = a.ndim + if not (-ndim <= axis1 < ndim and -ndim <= axis2 < ndim): + raise numpy.AxisError( + 'axis1(={0}) and axis2(={1}) must be within range ' + '(ndim={2})'.format(axis1, axis2, ndim)) + + axis1 %= ndim + axis2 %= ndim + if axis1 < axis2: + min_axis, max_axis = axis1, axis2 + else: + min_axis, max_axis = axis2, axis1 + + tr = list(range(ndim)) + del tr[max_axis] + del tr[min_axis] + if offset >= 0: + a = _manipulation._transpose(a, tr + [axis1, axis2]) + else: + a = _manipulation._transpose(a, tr + [axis2, axis1]) + offset = -offset + + diag_size = max(0, min(a.shape[-2], a.shape[-1] - offset)) + ret_shape = a.shape[:-2] + (diag_size,) + if diag_size == 0: + return core.ndarray(ret_shape, dtype=a.dtype) + + a = a[..., :diag_size, offset:offset + diag_size] + + ret = a.view() + # TODO(niboshi): Confirm update_x_contiguity flags + ret._set_shape_and_strides( + a.shape[:-2] + (diag_size,), + a.strides[:-2] + (a.strides[-1] + a.strides[-2],), + True, True) + return ret + + +_prepare_array_indexing = ElementwiseKernel( + 'T s, S len, S stride', + 'S out', + 'S in0 = s, in1 = len;' + 'out += stride * (in0 - _floor_divide(in0, in1) * in1)', + 'cupy_prepare_array_indexing') + + +cdef tuple _prepare_multiple_array_indexing( + _ndarray_base a, Py_ssize_t start, list slices +): + # slices consist of ndarray + cdef list indices = [], shapes = [] # int ndarrays + cdef Py_ssize_t i, stop, stride + cdef _ndarray_base reduced_idx, s + + for s in slices: + if s.dtype.kind == 'b': + s = _ndarray_argwhere(s).T + indices.extend(s) + shapes.append(s.shape[1:]) + else: + indices.append(s) + shapes.append(s.shape) + + stop = start + len(indices) + + # br = 
_manipulation.broadcast(*indices) + # indices = list(br.values) + + reduced_idx = core.ndarray( + internal._broadcast_shapes(shapes), dtype=numpy.int64) + reduced_idx.fill(0) + stride = 1 + i = stop + for s in reversed(indices): + i -= 1 + a_shape_i = a._shape[i] + # wrap all out-of-bound indices + if a_shape_i != 0: + _prepare_array_indexing(s, a_shape_i, stride, reduced_idx) + stride *= a_shape_i + + return reduced_idx, start, stop + + +cdef _ndarray_base _getitem_multiple( + _ndarray_base a, Py_ssize_t start, list slices): + reduced_idx, start, stop = _prepare_multiple_array_indexing( + a, start, slices) + return _take(a, reduced_idx, start, stop) + + +cdef _ndarray_base _add_reduceat( + _ndarray_base array, indices, axis, dtype, out): + from cupy._sorting import search + axis = internal._normalize_axis_index(axis, array.ndim) + indices = cupy.append(indices, array.shape[axis]) + shape = [1 if i == axis else dim for i, dim in enumerate(array.shape)] + acc = array.cumsum(axis, dtype) + acc = cupy.append(cupy.zeros(shape, acc.dtype), acc, axis) + mask = indices[:-1] >= indices[1:] + mask = mask.reshape(-1, *([1] * (array.ndim - axis - 1))) + return search._where_ufunc( + mask, + array.take(indices[:-1], axis), + acc.take(indices[1:], axis) - acc.take(indices[:-1], axis), + out + ) diff --git a/cupy/_core/_routines_linalg.pxd b/cupy/_core/_routines_linalg.pxd new file mode 100644 index 0000000..43dcdd1 --- /dev/null +++ b/cupy/_core/_routines_linalg.pxd @@ -0,0 +1,27 @@ +from cupy._core._carray cimport shape_t +from cupy._core.core cimport _ndarray_base + + +cpdef compute_type_to_str(compute_type) + +cpdef get_compute_type(dtype) + +cpdef _ndarray_base dot(_ndarray_base a, _ndarray_base b, _ndarray_base out=*) + +cpdef _ndarray_base tensordot_core( + _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t n, + Py_ssize_t m, Py_ssize_t k, const shape_t& ret_shape) + +cpdef _ndarray_base matmul( + _ndarray_base a, _ndarray_base b, _ndarray_base out=*) + + +cpdef enum: + COMPUTE_TYPE_TBD = 0 + COMPUTE_TYPE_DEFAULT = 1 # default + COMPUTE_TYPE_PEDANTIC = 2 # disable algorithmic optimizations + COMPUTE_TYPE_FP16 = 3 # allow converting inputs to FP16 + COMPUTE_TYPE_FP32 = 4 # allow converting inputs to FP32 + COMPUTE_TYPE_FP64 = 5 # allow converting inputs to FP64 + COMPUTE_TYPE_BF16 = 6 # allow converting inputs to BF16 + COMPUTE_TYPE_TF32 = 7 # allow converting inputs to TF32 diff --git a/cupy/_core/_routines_linalg.pyx b/cupy/_core/_routines_linalg.pyx new file mode 100644 index 0000000..0458e98 --- /dev/null +++ b/cupy/_core/_routines_linalg.pyx @@ -0,0 +1,1067 @@ +import math +import os +import warnings + +import cython +import numpy + +import cupy +from cupy._core._kernel import ElementwiseKernel +from cupy._core._reduction import ReductionKernel +from cupy._core._ufuncs import elementwise_copy +import cupy._core.core as core + + +from libc.stdint cimport intptr_t + +from cupy._core cimport _accelerator +from cupy._core._carray cimport shape_t +from cupy._core._dtype cimport to_cuda_dtype +from cupy._core._scalar cimport get_typename +from cupy._core.core cimport _internal_ascontiguousarray +from cupy._core.core cimport _ndarray_init +from cupy._core.core cimport ascontiguousarray +from cupy._core.core cimport _ndarray_base +from cupy._core cimport _memory_range +from cupy._core cimport _routines_manipulation as _manipulation +from cupy._core cimport _routines_math as _math +from cupy.cuda cimport device +from cupy_backends.cuda.api cimport runtime +from cupy_backends.cuda.libs 
cimport cublas + + +cdef extern from '../../cupy_backends/cupy_complex.h': + ctypedef struct cuComplex 'cuComplex': + float x, y + + ctypedef struct cuDoubleComplex 'cuDoubleComplex': + double x, y + + +cdef int _cuda_runtime_version = -1 + + +cdef list compute_types = [COMPUTE_TYPE_TBD, # float16 + COMPUTE_TYPE_TBD, # float32 + COMPUTE_TYPE_TBD] # float64 +cdef dict compute_type_str = { + 0: 'COMPUTE_TYPE_TBD', + 1: 'COMPUTE_TYPE_DEFAULT', + 2: 'COMPUTE_TYPE_PEDANTIC', + 3: 'COMPUTE_TYPE_FP16', + 4: 'COMPUTE_TYPE_FP32', + 5: 'COMPUTE_TYPE_FP64', + 6: 'COMPUTE_TYPE_BF16', + 7: 'COMPUTE_TYPE_TF32', +} + + +cpdef int to_compute_type_index(dtype) except -1: + cdef str dtype_char = numpy.dtype(dtype).char + if dtype_char == 'e': + return 0 + elif dtype_char in 'fF': + return 1 + elif dtype_char in 'dD': + return 2 + else: + raise TypeError('dtype is not supported: {}'.format(dtype)) + + +cpdef set_compute_type(dtype, compute_type): + global compute_types + if compute_type in (COMPUTE_TYPE_TBD, COMPUTE_TYPE_DEFAULT, + COMPUTE_TYPE_PEDANTIC, COMPUTE_TYPE_FP16, + COMPUTE_TYPE_FP32, COMPUTE_TYPE_FP64): + compute_types[to_compute_type_index(dtype)] = compute_type + elif compute_type in (COMPUTE_TYPE_BF16, COMPUTE_TYPE_TF32): + if int(device.get_compute_capability()) >= 80: + compute_types[to_compute_type_index(dtype)] = compute_type + else: + warnings.warn('COMPUTE_TYPE_BF16 and COMPUTE_TYPE_TF32 are only ' + 'available on GPUs with compute capability 8.0 or ' + 'higher. COMPUTE_TYPE_DEFAULT will be used instead.') + compute_types[to_compute_type_index(dtype)] = COMPUTE_TYPE_DEFAULT + else: + raise ValueError('Unknown compute type: {}'.format(compute_type)) + + +cpdef compute_type_to_str(compute_type): + if compute_type in compute_type_str: + return compute_type_str[compute_type] + else: + return compute_type + + +def _tensordot_core_int_kernel_impl(config, dtype, code, name): + # This code is based in the GEMM implementation from MAGMA + # (http://icl.cs.utk.edu/magma/) + code = ''' +#define fetch(arr, col, m, n, bound) arr[min(n*col + m, bound)] + +template +__device__ void _tensordot_core_int_kernel_impl( + int M, int N, int K, + const T* A, + const T* B, + T * C) +{ + int idx = threadIdx.x; + int idy = threadIdx.y; + + int idt = DIM_X * idy + idx; + + int idxA = idt % DIM_XA; + int idyA = idt / DIM_XA; + + int idxB = idt % DIM_XB; + int idyB = idt / DIM_XB; + + int blx = blockIdx.x; + int bly = blockIdx.y; + + __shared__ T sA[BLK_K][BLK_M + 1]; + __shared__ T sB[BLK_N][BLK_K + 1]; + + // registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + T ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + const T* offs_dA = A + blx * BLK_M + idyA * M + idxA; + int boundA = (M * (K - 1) + M) - (blx * BLK_M + idyA * M + idxA) - 1; + const T* offs_dB = B + bly * BLK_N * K + idyB * K + idxB; + int boundB = (K * (N - 1) + K) - (bly * BLK_N * K + idyB * K + idxB) - 1; + + int m, n, k, kk; + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0 ; m < THR_M; m++) { + rC[n][m] = 0; + } + } + + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) { + #pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) { + sA[n + idyA][m + idxA] = fetch(offs_dA, M, m, n, boundA); + } + } + // blockwise transpose to transpose load + #pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) { + #pragma unroll + for (m = 0; m < BLK_K; m += DIM_XB) { + sB[n + idyB][m + idxB] = fetch(offs_dB, K, m, n, boundB); + } 
+ } + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * M; + boundA -= BLK_K * M; + offs_dB += BLK_K; + boundB -= BLK_K; + + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) { + ra[n][m] = fetch(offs_dA, M, m * DIM_XA, n * DIM_YA, boundA); + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) { + rb[n][m] = fetch(offs_dB, K, m * DIM_XB, n * DIM_YB, boundB); + } + } + + // multiply + #pragma unroll + for (k = 0; k < BLK_K; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + // HIP is strange... + #ifdef __HIP_DEVICE_COMPILE__ + __syncthreads(); + #endif + + #pragma unroll + for (n = 0; n < THR_N; n++) { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rC[n][m] += rA[m] * rB[n]; + } + } + } + __syncthreads(); + + // store A regs->smem + #pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { + #pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA[n * DIM_YA + idyA][m * DIM_XA + idxA] = ra[n][m]; + } + } + + #pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { + #pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB[n * DIM_YB + idyB][m * DIM_XB + idxB] = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of columns of A and + // rows of B. + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. + + kk = K - kk; + #pragma unroll + for (k = 0; k < kk; k++) + { + #pragma unroll + for (m = 0; m < THR_M; m++) { + rA[m] = sA[k][m * DIM_X + idx]; + } + + #pragma unroll + for (n = 0; n < THR_N; n++) { + rB[n] = sB[n * DIM_Y + idy][k]; + } + + // HIP is strange... 
+        #ifdef __HIP_DEVICE_COMPILE__
+        __syncthreads();
+        #endif
+
+        #pragma unroll
+        for (n = 0; n < THR_N; n++) {
+            #pragma unroll
+            for (m = 0; m < THR_M; m++) {
+                rC[n][m] += rA[m] * rB[n];
+            }
+        }
+    }
+
+    #pragma unroll
+    for (n = 0; n < THR_N; n++) {
+        int coord_dCn = bly * BLK_N + n * DIM_Y + idy;
+        #pragma unroll
+        for (m = 0; m < THR_M; m++) {
+            int coord_dCm = blx * BLK_M + m * DIM_X + idx;
+            if (coord_dCm < M && coord_dCn < N) {
+                C[coord_dCn * M + coord_dCm] = rC[n][m];
+            }
+        }
+    }
+}
+''' + code
+    for k, v in config:
+        code = '#define ' + k + ' ' + str(v) + '\n' + code
+    name_expressions = [f'{name}<bool>',
+                        f'{name}<signed char>',
+                        f'{name}<unsigned char>',
+                        f'{name}<short>',
+                        f'{name}<unsigned short>',
+                        f'{name}<int>',
+                        f'{name}<unsigned int>',
+                        f'{name}<long>',
+                        f'{name}<unsigned long>',
+                        f'{name}<long long>',
+                        f'{name}<unsigned long long>']
+    mod = cupy.RawModule(code=code, options=('--std=c++11',),
+                         name_expressions=name_expressions)
+    ker = mod.get_function(name + '<' + get_typename(dtype) + '>')
+    return ker
+
+
+@cupy._util.memoize(for_each_device=True)
+def _tensordot_core_int_kernel(config, dtype):
+    code = '''
+template<typename T>
+__global__ void _tensordot_core_int_kernel(
+        int M, int N, int K,
+        const T* A,
+        const T* B,
+        T* C)
+{
+    _tensordot_core_int_kernel_impl<T>(M, N, K, A, B, C);
+}
+'''
+    name = '_tensordot_core_int_kernel'
+    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
+
+
+@cupy._util.memoize(for_each_device=True)
+def _tensordot_core_int_batched_kernel(config, dtype):
+    code = '''
+template<typename T>
+__global__ void _tensordot_core_int_batched_kernel(
+        int M, int N, int K,
+        const T* A[], const T* B[],
+        T* C[])
+{
+    int batchid = blockIdx.z;
+    _tensordot_core_int_kernel_impl<T>(
+        M, N, K, A[batchid], B[batchid], C[batchid]
+    );
+}
+'''
+    name = '_tensordot_core_int_batched_kernel'
+    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
+
+
+@cupy._util.memoize(for_each_device=True)
+def _tensordot_core_int_strided_batched_kernel(config, dtype):
+    code = '''
+template<typename T>
+__global__ void _tensordot_core_int_strided_batched_kernel(
+        int M, int N, int K,
+        const T* A, long long strideA,
+        const T* B, long long strideB,
+        T* C, long long strideC)
+{
+    int batchid = blockIdx.z;
+    _tensordot_core_int_kernel_impl<T>(
+        M, N, K,
+        &A[batchid * strideA],
+        &B[batchid * strideB],
+        &C[batchid * strideC]
+    );
+}
+'''
+    name = '_tensordot_core_int_strided_batched_kernel'
+    return _tensordot_core_int_kernel_impl(config, dtype, code, name)
+
+
+cdef tuple _integral_tensordot_core_config():
+    # TODO(leofang): autotune the tuning parameters here? See the discussion
+    # in this thread: https://groups.google.com/a/icl.utk.edu/g/magma-user/c/igc66uduTfI  # NOQA
+    dim_x = 16
+    dim_y = 16
+    blk_m = 128
+    blk_n = 128
+    blk_k = 2
+    dim_xa = 128
+    dim_ya = 2
+    dim_xb = 2
+    dim_yb = 128
+    config = (('DIM_X', dim_x), ('DIM_Y', dim_y),
+              ('BLK_M', blk_m), ('BLK_N', blk_n), ('BLK_K', blk_k),
+              ('DIM_XA', dim_xa), ('DIM_YA', dim_ya),
+              ('DIM_XB', dim_xb), ('DIM_YB', dim_yb),
+              ('THR_M', blk_m // dim_x), ('THR_N', blk_n // dim_y))
+    return config, dim_x, dim_y, blk_m, blk_n
+
+
+cdef _ndarray_base _integral_tensordot_core(
+        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
+        Py_ssize_t n, Py_ssize_t k, str dtype, const shape_t& ret_shape):
+
+    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
+    kern = _tensordot_core_int_kernel(config, dtype)
+    args = (m, n, k, a, b, out)
+    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
+    block = (dim_x, dim_y, 1)
+    kern(grid, block, args=args)
+    return out
+
+
+cdef _ndarray_base _integral_tensordot_core_batched(
+        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
+        Py_ssize_t n, Py_ssize_t k, str dtype, Py_ssize_t batch_count):
+
+    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
+    kern = _tensordot_core_int_batched_kernel(config, dtype)
+    block = (dim_x, dim_y, 1)
+    matPtrA = _mat_ptrs(a)
+    matPtrB = _mat_ptrs(b)
+    matPtrOut = _mat_ptrs(out)
+    max_batch_count = 65000
+    for i in range(0, batch_count, max_batch_count):
+        ibatch = min(max_batch_count, batch_count - i)
+        args = (
+            m, n, k, matPtrA[i:i + ibatch], matPtrB[i:i + ibatch],
+            matPtrOut[i:i + ibatch])
+        grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), ibatch)
+        kern(grid, block, args=args)
+    return out
+
+
+cdef _ndarray_base _integral_tensordot_core_strided_batched(
+        _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t m,
+        Py_ssize_t n, Py_ssize_t k, str dtype, Py_ssize_t batch_count):
+
+    config, dim_x, dim_y, blk_m, blk_n = _integral_tensordot_core_config()
+    kern = _tensordot_core_int_strided_batched_kernel(config, dtype)
+    block = (dim_x, dim_y, 1)
+    a = a.reshape((-1,) + a.shape[-2:])
+    b = b.reshape((-1,) + b.shape[-2:])
+    out = out.reshape((-1,) + out.shape[-2:])
+    strideA = _get_stride_for_strided_batched_gemm(a)
+    strideB = _get_stride_for_strided_batched_gemm(b)
+    strideOut = _get_stride_for_strided_batched_gemm(out)
+    max_batch_count = 65000
+    for i in range(0, batch_count, max_batch_count):
+        ibatch = min(max_batch_count, batch_count - i)
+        args = (
+            m, n, k, a[i:i + ibatch], strideA, b[i:i + ibatch], strideB,
+            out[i:i + ibatch], strideOut)
+        grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), ibatch)
+        kern(grid, block, args=args)
+    return out
+
+
+cdef _tensordot_core_mul_sum = ReductionKernel(
+    'S x, T y', 'U out',
+    'static_cast<U>(x) * static_cast<U>(y)',
+    'a + b', 'out = a', '0', '_tensordot_core_mul_sum')
+
+
+cpdef get_compute_type(dtype):
+    global compute_types
+    cdef int index = to_compute_type_index(dtype)
+    if compute_types[index] == COMPUTE_TYPE_TBD:
+        compute_type = COMPUTE_TYPE_DEFAULT
+        dtype_char = numpy.dtype(dtype).char
+        if dtype_char in 'fF' and int(os.getenv('CUPY_TF32', '0')) > 0:
+            compute_type = COMPUTE_TYPE_TF32
+        set_compute_type(dtype, compute_type)
+    return compute_types[index]
+
+
+@cython.profile(False)
+cpdef inline tuple _mat_to_cublas_contiguous(
+        _ndarray_base a, Py_ssize_t trans):
+    assert a.ndim == 2
+    if a._f_contiguous:
+        # builtin max function is not used for Cython 0.23
+        lda = 
a._strides[1] // a.itemsize + if lda < a._shape[0]: + lda = a._shape[0] + return a, trans, lda + if not a._c_contiguous: + a = a.copy() + return a, 1 - trans, a._strides[0] // a.itemsize + + +cpdef _ndarray_base dot( + _ndarray_base a, _ndarray_base b, _ndarray_base out=None): + cdef Py_ssize_t a_ndim, b_ndim, a_axis, b_axis, n, m, k + cdef bint input_a_is_vec, input_b_is_vec + cdef shape_t ret_shape, shape + + a_ndim = a._shape.size() + b_ndim = b._shape.size() + + if out is not None: + if numpy.result_type(a.dtype, b.dtype) != out.dtype: + raise ValueError('Not supported dtype combination.') + if not out._c_contiguous: + raise ValueError('Output array must be C-contiguous') + + if a_ndim == 0 or b_ndim == 0: + return _math._multiply(a, b, out=out) + + input_a_is_vec = a_ndim == 1 + input_b_is_vec = b_ndim == 1 + if input_a_is_vec: + shape.clear() + shape.push_back(1) + shape.push_back(a.size) + a = _manipulation._reshape(a, shape) + a_ndim = 2 + if input_b_is_vec: + shape.clear() + shape.push_back(b.size) + shape.push_back(1) + b = _manipulation._reshape(b, shape) + b_ndim = 2 + + a_axis = a_ndim - 1 + b_axis = b_ndim - 2 + + if a._shape[a_axis] != b._shape[b_axis]: + raise ValueError('Axis dimension mismatch') + + if a_axis: + a = _manipulation.rollaxis(a, a_axis, 0) + if b_axis: + b = _manipulation.rollaxis(b, b_axis, 0) + + k = a._shape[0] + if k != 0: + m = b.size // k + n = a.size // k + else: + # When k==0, the function must return a matrix filled with zero + # like NumPy. + m = 0 + n = 0 + + if not input_a_is_vec: + ret_shape.insert(ret_shape.end(), a._shape.begin() + 1, a._shape.end()) + if not input_b_is_vec: + ret_shape.insert(ret_shape.end(), b._shape.begin() + 1, b._shape.end()) + if out is not None: + # TODO(kataoka): Make the condition strict + if k != 0 and out.size != n * m: + raise ValueError('Output array has an invalid size') + + return tensordot_core(a, b, out, n, m, k, ret_shape) + + +cpdef _ndarray_base tensordot_core( + _ndarray_base a, _ndarray_base b, _ndarray_base out, Py_ssize_t n, + Py_ssize_t m, Py_ssize_t k, const shape_t& ret_shape): + # out, if specified, must be C-contiguous and have correct shape. 
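+    # In NumPy terms the core computes out = (a.T @ b).reshape(ret_shape),
+    # where `a` is treated as a (k, n) matrix and `b` as (k, m). A rough
+    # host-side sketch of this contract (illustrative only, not executed):
+    #
+    #     assert a.shape == (k, n) and b.shape == (k, m)
+    #     out[...] = (a.T @ b).reshape(ret_shape)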
+ cdef shape_t shape + cdef Py_ssize_t transa, transb, lda, ldb + cdef intptr_t handle + cdef _ndarray_base copy_to_out = None + cdef str dtype = a.dtype.char + cdef int compute_capability = int(device.get_compute_capability()) + if dtype != b.dtype.char: + dtype = numpy.promote_types(dtype, b.dtype).char + if not a.size or not b.size: + if out is None: + out = _ndarray_init(cupy.ndarray, ret_shape, dtype, None) + out.fill(0) + return out + + if out is not None: + assert out.flags.c_contiguous and out.dtype == dtype + cdef int ace + if m == 1 and n == 1: + if out is None: + out = _ndarray_init(cupy.ndarray, ret_shape, dtype, None) + c = _manipulation._reshape(out, ()) + for ace in _accelerator._routine_accelerators: + # fast path using CUB or cuTENSOR + if ace in (_accelerator.ACCELERATOR_CUB, + _accelerator.ACCELERATOR_CUTENSOR): + (a.ravel() * b.ravel()).sum(out=c) + break + else: + _tensordot_core_mul_sum(a.ravel(), b.ravel(), out=c) + return out + + a = a.astype(dtype, order='K', casting=None, subok=None, copy=False) + b = b.astype(dtype, order='K', casting=None, subok=None, copy=False) + # It copies the operands if needed + if a._shape.size() != 2 or a._shape[0] != k or a._shape[1] != n: + shape.clear() + shape.push_back(k) + shape.push_back(n) + a = _manipulation._reshape(a, shape) + if b._shape.size() != 2 or b._shape[0] != k or b._shape[1] != m: + shape.clear() + shape.push_back(k) + shape.push_back(m) + b = _manipulation._reshape(b, shape) + + # Be careful that cuBLAS uses the FORTRAN-order matrix representation. + # Matrix-Matrix product A^T * B + # c is C-contiguous while cuBLAS assumes F-contiguous inputs, so we + # compute C^T = B^T * A here. + a, transa, lda = _mat_to_cublas_contiguous(a, 0) + b, transb, ldb = _mat_to_cublas_contiguous(b, 1) + + if out is None: + out = c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None) + elif ( + _memory_range.may_share_bounds(out, a) + or _memory_range.may_share_bounds(out, b) + ): + copy_to_out = c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None) + else: + c = out + + if c._shape.size() != 2 or c._shape[0] != n or c._shape[1] != m: + c = c.view() + c.shape = (n, m) + + if dtype not in 'efdFD': + if transa: + a = a.T + a = _internal_ascontiguousarray(a) + if transb: + b = _internal_ascontiguousarray(b) + _integral_tensordot_core(b, a, c, m, n, k, dtype, ret_shape) + if copy_to_out is not None: + elementwise_copy(copy_to_out, out) + return out + + global _cuda_runtime_version + if _cuda_runtime_version < 0: + _cuda_runtime_version = runtime.runtimeGetVersion() + + if ( + not runtime._is_hip_environment and + _cuda_runtime_version >= 11000 and + compute_capability >= 50 + ): + tensordot_core_v11(transb, transa, m, n, k, b, ldb, a, lda, c, m) + if copy_to_out is not None: + elementwise_copy(copy_to_out, out) + return out + + handle = device.get_cublas_handle() + if dtype == 'e': + coef_dtype = 'f' + else: + coef_dtype = dtype + one = numpy.array(1.0, dtype=coef_dtype) + zero = numpy.array(0.0, dtype=coef_dtype) + if runtime._is_hip_environment and dtype == 'e': + # On HIP, SgemmEx does not work for half precision + dtype = 'f' + a = a.astype(dtype, order='K', casting=None, subok=None, copy=True) + b = b.astype(dtype, order='K', casting=None, subok=None, copy=True) + c = _ndarray_init(cupy.ndarray, ret_shape, dtype, None) + copy_to_out = c + warnings.warn('On ROCm/HIP, there is no specialized API to handle ' + 'half precision floating numbers, so the computation ' + 'will be done by casting to single precision') + if dtype == 'e': + 
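+        # float16 path: operands stay in FP16 while cuBLAS accumulates in
+        # FP32 (gemmEx below passes CUDA_R_16F data with CUDA_R_32F compute);
+        # tensor cores are additionally enabled on compute capability >= 7.0.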
use_tensor_core = (not runtime._is_hip_environment and
+                           _cuda_runtime_version >= 9000 and
+                           compute_capability >= 70)
+        if use_tensor_core:
+            cublas.setMathMode(handle, cublas.CUBLAS_TENSOR_OP_MATH)
+            cublas.gemmEx(
+                handle, transb, transa, m, n, k,
+                one.ctypes.data, b.data.ptr, runtime.CUDA_R_16F, ldb,
+                a.data.ptr, runtime.CUDA_R_16F, lda, zero.ctypes.data,
+                c.data.ptr, runtime.CUDA_R_16F, m, runtime.CUDA_R_32F,
+                cublas.CUBLAS_GEMM_DEFAULT_TENSOR_OP)
+            cublas.setMathMode(handle, cublas.CUBLAS_DEFAULT_MATH)
+        else:
+            cublas.sgemmEx(
+                handle, transb, transa, m, n, k,
+                one.ctypes.data, b.data.ptr, runtime.CUDA_R_16F, ldb,
+                a.data.ptr, runtime.CUDA_R_16F, lda, zero.ctypes.data,
+                c.data.ptr, runtime.CUDA_R_16F, m)
+    elif dtype == 'f':
+        cublas.sgemmEx(
+            handle, transb, transa, m, n, k,
+            one.ctypes.data, b.data.ptr, runtime.CUDA_R_32F, ldb,
+            a.data.ptr, runtime.CUDA_R_32F, lda, zero.ctypes.data,
+            c.data.ptr, runtime.CUDA_R_32F, m)
+    elif dtype == 'd':
+        cublas.dgemm(
+            handle, transb, transa, m, n, k,
+            one.ctypes.data, b.data.ptr, ldb, a.data.ptr, lda,
+            zero.ctypes.data, c.data.ptr, m)
+    elif dtype == 'F':
+        cublas.cgemm(
+            handle, transb, transa, m, n, k,
+            one.ctypes.data, b.data.ptr, ldb, a.data.ptr, lda,
+            zero.ctypes.data, c.data.ptr, m)
+    elif dtype == 'D':
+        cublas.zgemm(
+            handle, transb, transa, m, n, k,
+            one.ctypes.data, b.data.ptr, ldb, a.data.ptr, lda,
+            zero.ctypes.data, c.data.ptr, m)
+    else:
+        raise ValueError('Invalid dtype: %s' % str(dtype))
+    if copy_to_out is not None:
+        elementwise_copy(copy_to_out, out)
+    return out
+
+
+cpdef _ndarray_base tensordot_core_v11(
+        Py_ssize_t transa, Py_ssize_t transb, Py_ssize_t m, Py_ssize_t n,
+        Py_ssize_t k, _ndarray_base a, Py_ssize_t lda, _ndarray_base b,
+        Py_ssize_t ldb, _ndarray_base c, Py_ssize_t ldc):
+    cdef float one_f, zero_f
+    cdef double one_d, zero_d
+    cdef cuComplex one_F, zero_F
+    cdef cuDoubleComplex one_D, zero_D
+    cdef size_t one_ptr, zero_ptr
+
+    cdef int compute_capability = int(device.get_compute_capability())
+    cdef int compute_type = get_compute_type(c.dtype)
+    cdef int cublas_compute_type = -1
+    if c.dtype.char in 'efF':
+        if compute_type == COMPUTE_TYPE_PEDANTIC:
+            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F_PEDANTIC
+        elif compute_type == COMPUTE_TYPE_TF32 and c.dtype.char in 'fF':
+            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F_FAST_TF32
+        else:
+            cublas_compute_type = cublas.CUBLAS_COMPUTE_32F
+    elif c.dtype.char in 'dD':
+        if compute_type == COMPUTE_TYPE_PEDANTIC:
+            cublas_compute_type = cublas.CUBLAS_COMPUTE_64F_PEDANTIC
+        else:
+            cublas_compute_type = cublas.CUBLAS_COMPUTE_64F
+    else:
+        raise ValueError('Invalid dtype: {}'.format(c.dtype))
+
+    cdef int algo = cublas.CUBLAS_GEMM_DEFAULT
+    if ((compute_capability >= 80) or
+            (compute_capability >= 70 and c.dtype == 'e')):
+        algo = cublas.CUBLAS_GEMM_DEFAULT_TENSOR_OP
+
+    if cublas_compute_type in (cublas.CUBLAS_COMPUTE_32F,
+                               cublas.CUBLAS_COMPUTE_32F_PEDANTIC,
+                               cublas.CUBLAS_COMPUTE_32F_FAST_TF32):
+        if c.dtype.char in 'efd':
+            one_f = 1
+            zero_f = 0
+            one_ptr = <size_t>&one_f
+            zero_ptr = <size_t>&zero_f
+        else:
+            one_F = cuComplex(1, 0)
+            zero_F = cuComplex(0, 0)
+            one_ptr = <size_t>&one_F
+            zero_ptr = <size_t>&zero_F
+    elif cublas_compute_type in (cublas.CUBLAS_COMPUTE_64F,
+                                 cublas.CUBLAS_COMPUTE_64F_PEDANTIC):
+        if c.dtype.char in 'efd':
+            one_d = 1
+            zero_d = 0
+            one_ptr = <size_t>&one_d
+            zero_ptr = <size_t>&zero_d
+        else:
+            one_D = cuDoubleComplex(1, 0)
+            zero_D = cuDoubleComplex(0, 0)
+            one_ptr = <size_t>&one_D
+            zero_ptr = <size_t>&zero_D
+    else:
+        raise ValueError('Invalid cublas compute type: {}'
+                         .format(cublas_compute_type))
+
+    cdef int a_cuda_dtype = to_cuda_dtype(a.dtype, is_half_allowed=True)
+    cdef int b_cuda_dtype = to_cuda_dtype(b.dtype, is_half_allowed=True)
+    cdef int c_cuda_dtype = to_cuda_dtype(c.dtype, is_half_allowed=True)
+    cdef intptr_t handle = device.get_cublas_handle()
+    cublas.gemmEx(
+        handle, transa, transb, m, n, k, one_ptr,
+        a.data.ptr, a_cuda_dtype, lda, b.data.ptr, b_cuda_dtype, ldb,
+        zero_ptr, c.data.ptr, c_cuda_dtype, ldc, cublas_compute_type,
+        algo)
+
+
+cdef Py_ssize_t _get_stride_for_strided_batched_gemm(
+        _ndarray_base a) except? 0:
+    cdef int ndim = a._shape.size()
+    assert ndim > 2
+    assert a._c_contiguous
+    return a._shape[ndim - 2] * a._shape[ndim - 1]
+
+
+cdef _mat_ptrs_kernel = ElementwiseKernel(
+    'T base, T stride', 'T out',
+    'out = base + _ind.get()[_ind.ndim - 1] * stride', 'cupy_mat_ptrs',
+    reduce_dims=False)
+
+
+cpdef _ndarray_base _mat_ptrs(_ndarray_base a):
+    """Creates an array of pointers to matrices.
+
+    Args:
+        a: A batch of matrices on GPU.
+            shape: (A, B, C) -> A ptrs to mat of size (B, C)
+            shape: (A_1, ..., A_N, B, C) -> A_1*...*A_N ptrs to mat of
+                   size (B, C)
+
+    Returns:
+        GPU array of pointers to matrices.
+    """
+    cdef int ndim = a._shape.size()
+    assert ndim > 2
+    cdef _ndarray_base idx
+    idx = _mat_ptrs_kernel(
+        a.data.ptr, a._strides[0],
+        core.ndarray((a._shape[0],), dtype=numpy.uintp))
+
+    for i in range(1, ndim - 2):
+        idx = _mat_ptrs_kernel(
+            idx[:, None], a._strides[i],
+            core.ndarray((idx.size, a._shape[i]), dtype=numpy.uintp))
+        idx = idx.ravel()
+    return idx
+
+
+cpdef _ndarray_base matmul(
+        _ndarray_base a, _ndarray_base b, _ndarray_base out=None):
+    """Matrix product of two arrays.
+
+    Returns the matrix product of two arrays and is the implementation of
+    the `@` operator introduced in Python 3.5 following PEP 465.
+
+    The main difference from :func:`cupy.dot` is the handling of arrays with
+    more than 2 dimensions. For more information see :func:`numpy.matmul`.
+
+    Args:
+        a (cupy.ndarray): The left argument.
+        b (cupy.ndarray): The right argument.
+        out (cupy.ndarray): Output array.
+
+    Returns:
+        cupy.ndarray: Output array.
+
+    .. 
seealso:: :func:`numpy.matmul` + + """ + + cdef Py_ssize_t i, n, m, ka, kb, a_sh, b_sh, c_sh, ldc + cdef Py_ssize_t batchCount, a_part_outshape, b_part_outshape + cdef int orig_a_ndim, orig_b_ndim, a_ndim, b_ndim, ndim + cdef _ndarray_base ap, bp, cp, c_view + cdef bint use_broadcast + + orig_a_ndim = a._shape.size() + orig_b_ndim = b._shape.size() + if orig_a_ndim == 0 or orig_b_ndim == 0: + raise ValueError('Scalar operands are not allowed, use \'*\' instead') + + ndim = max(orig_a_ndim, orig_b_ndim) + if ndim <= 2: + if out is None: + return dot(a, b, out) + ret_dtype = numpy.promote_types(a.dtype, b.dtype) + if out._c_contiguous and ret_dtype == out.dtype: + return dot(a, b, out) + c = _ndarray_init(cupy.ndarray, out._shape, dtype=ret_dtype, obj=None) + dot(a, b, c) + elementwise_copy(c, out) + return out + + orig_a = a + orig_b = b + a_part_outshape = b_part_outshape = 0 + if orig_a_ndim == 1: + a = _manipulation._reshape(a, (1, a.size)) + else: + a = a.view() + a_part_outshape = a._shape[orig_a_ndim - 2] + if orig_b_ndim == 1: + b = _manipulation._reshape(b, (b.size, 1)) + ldc = 1 + else: + b = b.view() + b_part_outshape = ldc = b._shape[orig_b_ndim - 1] + + # expand dims + a_ndim = a._shape.size() + b_ndim = b._shape.size() + if a_ndim < ndim: + # TODO(niboshi): Confirm update_x_contiguity flags + a._set_shape_and_strides( + (1,) * (ndim - a_ndim) + a.shape, + (0,) * (ndim - a_ndim) + a.strides, + True, True) + if b_ndim < ndim: + # TODO(niboshi): Confirm update_x_contiguity flags + b._set_shape_and_strides( + (1,) * (ndim - b_ndim) + b.shape, + (0,) * (ndim - b_ndim) + b.strides, + True, True) + + ret_dtype = numpy.promote_types(a.dtype, b.dtype) + dtype = ret_dtype + if dtype.char == 'e': + dtype = numpy.dtype('f') + + a = ascontiguousarray(a, dtype) + b = ascontiguousarray(b, dtype) + + # broadcast + batchCount = 1 # batchCount = numpy.prod(out_shape[:-2]) + out_shape = [] + use_broadcast = False + for i in range(0, ndim - 2): + a_sh = a._shape[i] + b_sh = b._shape[i] + if a_sh != b_sh and a_sh != 1 and b_sh != 1: + raise ValueError( + 'operands could not be broadcast together with ' + 'remapped shapes') + + if a_sh == 0 or b_sh == 0: + c_sh = 0 + else: + c_sh = max(a_sh, b_sh) + batchCount *= c_sh + out_shape.append(c_sh) + if a_sh == 1 and c_sh > 1: + a._strides[i] = 0 + a._shape[i] = c_sh + a._c_contiguous = a._f_contiguous = False + use_broadcast = True + + if b_sh == 1 and c_sh > 1: + b._strides[i] = 0 + b._shape[i] = c_sh + b._c_contiguous = b._f_contiguous = False + use_broadcast = True + + if orig_a_ndim != 1: + out_shape.append(a_part_outshape) + if orig_b_ndim != 1: + out_shape.append(b_part_outshape) + + # (A B)^T = B^T A^T + a, b = b, a + + ka = a._shape[ndim - 2] + lda = n = a._shape[ndim - 1] + m = b._shape[ndim - 2] + ldb = kb = b._shape[ndim - 1] + + if ka != kb: + raise ValueError( + 'shapes ({}) and ({}) not aligned'.format( + ','.join([str(_) for _ in orig_a.shape]), + ','.join([str(_) for _ in orig_b.shape]))) + + if out is not None and out.shape != tuple(out_shape): + raise ValueError('Output array has an invalid size') + + if a.size == 0 or b.size == 0: + if out is None: + return cupy.zeros(out_shape, ret_dtype) + else: + out.fill(0) + return out + + if ( + out is not None and out.dtype == dtype and out.flags.c_contiguous + and not _memory_range.may_share_bounds(out, a) + and not _memory_range.may_share_bounds(out, b) + ): + c = out + else: + c = core.ndarray(out_shape, dtype=dtype) + if out is None: + if dtype == ret_dtype: + out = c + else: + out = 
core.ndarray(out_shape, dtype=ret_dtype) + + if orig_a_ndim == 1 or orig_b_ndim == 1: + c_view = c.view() + if orig_b_ndim == 1: + c_view._shape.push_back(1) + c_view._strides.push_back(0) + if orig_a_ndim == 1: + c_view._shape.insert(c_view._shape.end() - 1, 1) + c_view._strides.insert(c_view._strides.end() - 1, 0) + assert c_view._c_contiguous + c_view._update_f_contiguity() + else: + c_view = c + + if dtype.char not in 'efdFD': + if not use_broadcast: + _integral_tensordot_core_strided_batched( + a, b, c_view, n, m, ka, dtype.char, batchCount) + else: + _integral_tensordot_core_batched( + a, b, c_view, n, m, ka, dtype.char, batchCount) + if out is not c: + elementwise_copy(c, out) + return out + + global _cuda_runtime_version + if _cuda_runtime_version < 0: + _cuda_runtime_version = runtime.runtimeGetVersion() + + cdef intptr_t handle = device.get_cublas_handle() + cdef int cuda_dtype = to_cuda_dtype(dtype) + cdef int algo = cublas.CUBLAS_GEMM_DEFAULT + + one = numpy.array(1, dtype=dtype) + zero = numpy.array(0, dtype=dtype) + if not use_broadcast: + strideA = _get_stride_for_strided_batched_gemm(a) + strideB = _get_stride_for_strided_batched_gemm(b) + strideC = _get_stride_for_strided_batched_gemm(c_view) + if dtype.char in 'fFdD': + cublas.gemmStridedBatchedEx( + handle, + 0, # transa + 0, # transb + n, m, ka, one.ctypes.data, + a.data.ptr, cuda_dtype, lda, strideA, + b.data.ptr, cuda_dtype, ldb, strideB, + zero.ctypes.data, + c_view.data.ptr, cuda_dtype, ldc, strideC, + batchCount, cuda_dtype, algo) + else: + raise TypeError(dtype, a.dtype, b.dtype) + else: + ap = _mat_ptrs(a) + bp = _mat_ptrs(b) + cp = _mat_ptrs(c_view) + if dtype == numpy.float32: + cublas.sgemmBatched( + handle, + 0, # transa + 0, # transb + n, m, ka, one.ctypes.data, + ap.data.ptr, lda, + bp.data.ptr, ldb, + zero.ctypes.data, cp.data.ptr, ldc, batchCount) + elif dtype == numpy.float64: + cublas.dgemmBatched( + handle, + 0, # transa + 0, # transb + n, m, ka, one.ctypes.data, + ap.data.ptr, lda, + bp.data.ptr, ldb, + zero.ctypes.data, cp.data.ptr, ldc, batchCount) + elif dtype == numpy.complex64: + cublas.cgemmBatched( + handle, + 0, # transa + 0, # transb + n, m, ka, one.ctypes.data, + ap.data.ptr, lda, + bp.data.ptr, ldb, + zero.ctypes.data, cp.data.ptr, ldc, batchCount) + elif dtype == numpy.complex128: + cublas.zgemmBatched( + handle, + 0, # transa + 0, # transb + n, m, ka, one.ctypes.data, + ap.data.ptr, lda, + bp.data.ptr, ldb, + zero.ctypes.data, cp.data.ptr, ldc, batchCount) + else: + raise TypeError(dtype, a.dtype, b.dtype) + + if out is not c: + elementwise_copy(c, out) + return out diff --git a/cupy/_core/_routines_logic.pxd b/cupy/_core/_routines_logic.pxd new file mode 100644 index 0000000..f5c49dc --- /dev/null +++ b/cupy/_core/_routines_logic.pxd @@ -0,0 +1,11 @@ +from cupy._core.core cimport _ndarray_base + + +cdef _ndarray_base _ndarray_all(_ndarray_base self, axis, out, keepdims) +cdef _ndarray_base _ndarray_any(_ndarray_base self, axis, out, keepdims) +cdef _ndarray_base _ndarray_greater(_ndarray_base self, other) +cdef _ndarray_base _ndarray_greater_equal(_ndarray_base self, other) +cdef _ndarray_base _ndarray_less(_ndarray_base self, other) +cdef _ndarray_base _ndarray_less_equal(_ndarray_base self, other) +cdef _ndarray_base _ndarray_equal(_ndarray_base self, other) +cdef _ndarray_base _ndarray_not_equal(_ndarray_base self, other) diff --git a/cupy/_core/_routines_logic.pyx b/cupy/_core/_routines_logic.pyx new file mode 100644 index 0000000..5f8e51c --- /dev/null +++ 
b/cupy/_core/_routines_logic.pyx
@@ -0,0 +1,141 @@
+from cupy._core._kernel import create_ufunc
+from cupy._core._reduction import create_reduction_func
+
+from cupy._core.core cimport _ndarray_base
+
+
+cdef _ndarray_base _ndarray_all(_ndarray_base self, axis, out, keepdims):
+    return _all(self, axis=axis, out=out, keepdims=keepdims)
+
+
+cdef _ndarray_base _ndarray_any(_ndarray_base self, axis, out, keepdims):
+    return _any(self, axis=axis, out=out, keepdims=keepdims)
+
+
+cdef _ndarray_base _ndarray_greater(_ndarray_base self, other):
+    return _greater(self, other)
+
+
+cdef _ndarray_base _ndarray_greater_equal(_ndarray_base self, other):
+    return _greater_equal(self, other)
+
+
+cdef _ndarray_base _ndarray_less(_ndarray_base self, other):
+    return _less(self, other)
+
+
+cdef _ndarray_base _ndarray_less_equal(_ndarray_base self, other):
+    return _less_equal(self, other)
+
+
+cdef _ndarray_base _ndarray_equal(_ndarray_base self, other):
+    return _equal(self, other)
+
+
+cdef _ndarray_base _ndarray_not_equal(_ndarray_base self, other):
+    return _not_equal(self, other)
+
+
+cdef _all = create_reduction_func(
+    'cupy_all',
+    ('?->?', 'B->?', 'h->?', 'H->?', 'i->?', 'I->?', 'l->?', 'L->?',
+     'q->?', 'Q->?', 'e->?', 'f->?', 'd->?', 'F->?', 'D->?'),
+    ('in0 != type_in0_raw(0)', 'a & b', 'out0 = a', 'bool'),
+    'true', '')
+
+
+cdef _any = create_reduction_func(
+    'cupy_any',
+    ('?->?', 'B->?', 'h->?', 'H->?', 'i->?', 'I->?', 'l->?', 'L->?',
+     'q->?', 'Q->?', 'e->?', 'f->?', 'd->?', 'F->?', 'D->?'),
+    ('in0 != type_in0_raw(0)', 'a | b', 'out0 = a', 'bool'),
+    'false', '')
+
+
+cpdef create_comparison(name, op, doc='', no_complex_dtype=True):
+
+    if no_complex_dtype:
+        ops = ('??->?', 'bb->?', 'BB->?', 'hh->?', 'HH->?', 'ii->?', 'II->?',
+               'll->?', 'LL->?', 'qq->?', 'QQ->?', 'ee->?', 'ff->?', 'dd->?')
+    else:
+        ops = ('??->?', 'bb->?', 'BB->?', 'hh->?', 'HH->?', 'ii->?', 'II->?',
+               'll->?', 'LL->?', 'qq->?', 'QQ->?', 'ee->?', 'ff->?', 'dd->?',
+               'FF->?', 'DD->?')
+
+    return create_ufunc(
+        'cupy_' + name,
+        ops,
+        'out0 = in0 %s in1' % op,
+        doc=doc)
+
+
+cdef _greater = create_comparison(
+    'greater', '>',
+    '''Tests elementwise if ``x1 > x2``.
+
+    .. seealso:: :data:`numpy.greater`
+
+    ''',
+    no_complex_dtype=False)
+
+
+cdef _greater_equal = create_comparison(
+    'greater_equal', '>=',
+    '''Tests elementwise if ``x1 >= x2``.
+
+    .. seealso:: :data:`numpy.greater_equal`
+
+    ''',
+    no_complex_dtype=False)
+
+
+cdef _less = create_comparison(
+    'less', '<',
+    '''Tests elementwise if ``x1 < x2``.
+
+    .. seealso:: :data:`numpy.less`
+
+    ''',
+    no_complex_dtype=False)
+
+
+cdef _less_equal = create_comparison(
+    'less_equal', '<=',
+    '''Tests elementwise if ``x1 <= x2``.
+
+    .. seealso:: :data:`numpy.less_equal`
+
+    ''',
+    no_complex_dtype=False)
+
+
+cdef _equal = create_comparison(
+    'equal', '==',
+    '''Tests elementwise if ``x1 == x2``.
+
+    .. seealso:: :data:`numpy.equal`
+
+    ''',
+    no_complex_dtype=False)
+
+
+cdef _not_equal = create_comparison(
+    'not_equal', '!=',
+    '''Tests elementwise if ``x1 != x2``.
+
+    .. seealso:: :data:`numpy.not_equal`
+
+    ''',
+    no_complex_dtype=False)
+
+
+# Variables to expose to Python
+# (cythonized data cannot be exposed to Python, even with cpdef.)
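+# Once re-exported via the package namespace, these behave like their NumPy
+# counterparts; a hypothetical session for the comparison ufuncs:
+#
+#     >>> cupy.greater(cupy.arange(3), 1)
+#     array([False, False,  True])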
+all = _all +any = _any +greater = _greater +greater_equal = _greater_equal +less = _less +less_equal = _less_equal +equal = _equal +not_equal = _not_equal diff --git a/cupy/_core/_routines_manipulation.pxd b/cupy/_core/_routines_manipulation.pxd new file mode 100644 index 0000000..ae46965 --- /dev/null +++ b/cupy/_core/_routines_manipulation.pxd @@ -0,0 +1,40 @@ +from libcpp cimport vector + +from cupy._core._carray cimport shape_t +from cupy._core._carray cimport strides_t +from cupy._core.core cimport _ndarray_base + + +cdef class broadcast: + cdef: + readonly tuple values + readonly tuple shape + readonly Py_ssize_t size + readonly Py_ssize_t nd + + +cdef _ndarray_shape_setter(_ndarray_base self, newshape) +cdef _ndarray_base _ndarray_reshape(_ndarray_base self, tuple shape, order) +cdef _ndarray_base _ndarray_transpose(_ndarray_base self, tuple axes) +cdef _ndarray_base _ndarray_swapaxes( + _ndarray_base self, Py_ssize_t axis1, Py_ssize_t axis2) +cdef _ndarray_base _ndarray_flatten(_ndarray_base self, order) +cdef _ndarray_base _ndarray_ravel(_ndarray_base self, order) +cdef _ndarray_base _ndarray_squeeze(_ndarray_base self, axis) +cdef _ndarray_base _ndarray_repeat(_ndarray_base self, repeats, axis) + +cpdef _ndarray_base _expand_dims(_ndarray_base a, tuple axis) +cpdef _ndarray_base moveaxis(_ndarray_base a, source, destination) +cpdef _ndarray_base _move_single_axis( + _ndarray_base a, Py_ssize_t source, Py_ssize_t destination) +cpdef _ndarray_base rollaxis( + _ndarray_base a, Py_ssize_t axis, Py_ssize_t start=*) +cpdef _ndarray_base broadcast_to(_ndarray_base array, shape) +cpdef _ndarray_base _reshape(_ndarray_base self, const shape_t &shape_spec) +cpdef _ndarray_base _T(_ndarray_base self) +cpdef _ndarray_base _transpose( + _ndarray_base self, const vector.vector[Py_ssize_t] &axes) +cpdef _ndarray_base _concatenate( + list arrays, Py_ssize_t axis, tuple shape, _ndarray_base out, str casting) +cpdef _ndarray_base concatenate_method( + tup, int axis, _ndarray_base out=*, dtype=*, casting=*) diff --git a/cupy/_core/_routines_manipulation.pyx b/cupy/_core/_routines_manipulation.pyx new file mode 100644 index 0000000..50cb8a3 --- /dev/null +++ b/cupy/_core/_routines_manipulation.pyx @@ -0,0 +1,885 @@ +# distutils: language = c++ +import functools + +import numpy + +from cupy._core._kernel import ElementwiseKernel +from cupy._core._ufuncs import elementwise_copy +import cupy._core.core as core + +cimport cpython # NOQA +cimport cython # NOQA +from libcpp cimport vector + +from cupy._core._dtype cimport get_dtype, _raise_if_invalid_cast +from cupy._core cimport core +from cupy._core.core cimport _ndarray_base +from cupy._core cimport internal +from cupy._core._kernel cimport _check_peer_access, _preprocess_args + +from cupy.cuda import device + + +@cython.final +cdef class broadcast: + """Object that performs broadcasting. + + CuPy actually uses this class to support broadcasting in various + operations. Note that this class does not provide an iterator. + + Args: + arrays (tuple of arrays): Arrays to be broadcasted. + + Attributes: + ~broadcast.shape (tuple of ints): The broadcasted shape. + nd (int): Number of dimensions of the broadcasted shape. + ~broadcast.size (int): Total size of the broadcasted shape. + values (list of arrays): The broadcasted arrays. + + .. 
seealso:: :class:`numpy.broadcast` + + """ + + def __init__(self, *arrays): + cdef shape_t shape + cdef list val = list(arrays) + internal._broadcast_core(val, shape) + self.values = tuple(val) + self.shape = tuple(shape) + self.nd = shape.size() + self.size = internal.prod(shape) + + +# _ndarray_base members + + +cdef _ndarray_shape_setter(_ndarray_base self, newshape): + cdef shape_t shape, strides + if not cpython.PySequence_Check(newshape): + newshape = (newshape,) + shape = internal.infer_unknown_dimension(newshape, self.size) + _get_strides_for_nocopy_reshape(self, shape, strides) + if strides.size() != shape.size(): + raise AttributeError( + 'Incompatible shape for in-place modification. Use `.reshape()` ' + 'to make a copy with the desired shape.') + self._set_shape_and_strides(shape, strides, False, True) + + +cdef _ndarray_base _ndarray_reshape(_ndarray_base self, tuple shape, order): + cdef int order_char = internal._normalize_order(order, False) + + if len(shape) == 1 and cpython.PySequence_Check(shape[0]): + shape = tuple(shape[0]) + + if order_char == b'A': + if self._f_contiguous and not self._c_contiguous: + order_char = b'F' + else: + order_char = b'C' + if order_char == b'C': + return _reshape(self, shape) + else: + # TODO(grlee77): Support order within _reshape instead + + # The Fortran-ordered case is equivalent to: + # 1.) reverse the axes via transpose + # 2.) C-ordered reshape using reversed shape + # 3.) reverse the axes via transpose + return _T(_reshape(_T(self), shape[::-1])) + + +cdef _ndarray_base _ndarray_transpose(_ndarray_base self, tuple axes): + if len(axes) == 0: + return _T(self) + if len(axes) == 1: + a = axes[0] + if a is None: + return _T(self) + elif cpython.PySequence_Check(a): + axes = tuple(a) + return _transpose(self, axes) + + +cdef _ndarray_base _ndarray_swapaxes( + _ndarray_base self, Py_ssize_t axis1, Py_ssize_t axis2): + cdef Py_ssize_t ndim = self.ndim + cdef vector.vector[Py_ssize_t] axes + if axis1 < -ndim or axis1 >= ndim or axis2 < -ndim or axis2 >= ndim: + raise ValueError('Axis out of range') + axis1 %= ndim + axis2 %= ndim + for i in range(ndim): + axes.push_back(i) + axes[axis1], axes[axis2] = axes[axis2], axes[axis1] + return _transpose(self, axes) + + +cdef _ndarray_base _ndarray_flatten(_ndarray_base self, order): + cdef int order_char + cdef vector.vector[Py_ssize_t] axes + + order_char = internal._normalize_order(order, True) + if order_char == b'A': + if self._f_contiguous and not self._c_contiguous: + order_char = b'F' + else: + order_char = b'C' + if order_char == b'C': + return _ndarray_flatten_order_c(self) + elif order_char == b'F': + return _ndarray_flatten_order_c(_T(self)) + elif order_char == b'K': + axes = _npyiter_k_order_axes(self.strides) + return _ndarray_flatten_order_c(_transpose(self, axes)) + + +cdef _ndarray_base _ndarray_flatten_order_c(_ndarray_base self): + newarray = self.copy(order='C') + newarray._shape.assign(1, self.size) + newarray._strides.assign(1, + self.itemsize) + newarray._c_contiguous = True + newarray._f_contiguous = True + return newarray + + +cdef vector.vector[Py_ssize_t] _npyiter_k_order_axes(strides_t& strides): + # output transpose axes such that + # x.flatten(order="K") == x.transpose(axes).flatten(order="C") + # by reproducing `npyiter_find_best_axis_ordering` + # in numpy/core/src/multiarray/nditer_constr.c + + # Note that `flatten` and `ravel` should use this function for order="K", + # while `copy(order="K")` should use `internal._get_strides_for_order_K`. 
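+    # Worked example with hypothetical strides (800, -4, 80): the axes come
+    # out as [0, 2, 1] because |800| > |80| > |-4|, i.e.
+    #     x.flatten(order="K") == x.transpose(0, 2, 1).flatten(order="C")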
+ cdef vector.vector[Py_ssize_t] axes + cdef Py_ssize_t stride0, stride1 + cdef int ndim, i0, i1, ipos, k + ndim = strides.size() + for i0 in reversed(range(ndim)): + stride0 = abs(strides[i0]) + if stride0 == 0: # ambiguous + axes.insert(axes.begin(), i0) + continue + ipos = 0 + for k, i1 in enumerate(axes): + stride1 = abs(strides[i1]) + if stride1 == 0: # ambiguous + continue + elif stride1 <= stride0: # shouldswap = false + break + else: # shouldswap = true + ipos = k + 1 + axes.insert(axes.begin() + ipos, i0) + return axes + + +cdef _ndarray_base _ndarray_ravel(_ndarray_base self, order): + cdef int order_char + cdef shape_t shape + cdef vector.vector[Py_ssize_t] axes + shape.push_back(self.size) + + order_char = internal._normalize_order(order, True) + if order_char == b'A': + if self._f_contiguous and not self._c_contiguous: + order_char = b'F' + else: + order_char = b'C' + if order_char == b'C': + return _reshape(self, shape) + elif order_char == b'F': + return _reshape(_T(self), shape) + elif order_char == b'K': + axes = _npyiter_k_order_axes(self.strides) + return _reshape(_transpose(self, axes), shape) + + +cdef _ndarray_base _ndarray_squeeze(_ndarray_base self, axis): + cdef vector.vector[char] axis_flags + cdef shape_t newshape + cdef strides_t newstrides + cdef Py_ssize_t ndim, naxes, _axis + + ndim = self._shape.size() + axis_flags = vector.vector[char](ndim, 0) + + # Convert axis to boolean flag. + if axis is None: + for idim in range(ndim): + if self._shape[idim] == 1: + axis_flags[idim] = 1 + elif isinstance(axis, tuple): + naxes = len(axis) + for i in range(naxes): + _axis = internal._normalize_axis_index(axis[i], ndim) + if axis_flags[_axis] == 1: + raise ValueError('duplicate value in \'axis\'') + axis_flags[_axis] = 1 + else: + _axis = axis + if ndim == 0 and (_axis == 0 or _axis == -1): + # Special case letting axis={-1,0} slip through for scalars, + # for backwards compatibility reasons. 
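+            # e.g. squeezing a 0-d array with axis=0 or axis=-1 returns the
+            # array unchanged instead of raising, matching NumPy.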
+ pass + else: + _axis = internal._normalize_axis_index(_axis, ndim) + axis_flags[_axis] = 1 + + # Verify that the axes requested are all of size one + any_ones = 0 + for idim in range(ndim): + if axis_flags[idim] != 0: + if self._shape[idim] == 1: + any_ones = 1 + else: + raise ValueError('cannot select an axis to squeeze out ' + 'which has size not equal to one') + + # If there were no axes to squeeze out, return the same array + if any_ones == 0: + return self + + for i in range(ndim): + if axis_flags[i] == 0: + newshape.push_back(self._shape[i]) + newstrides.push_back(self._strides[i]) + + v = self.view() + # TODO(niboshi): Confirm update_x_contiguity flags + v._set_shape_and_strides(newshape, newstrides, False, True) + return v + + +cdef _ndarray_base _ndarray_repeat(_ndarray_base self, repeats, axis): + return _repeat(self, repeats, axis) + + +# exposed + + +cpdef _ndarray_base _expand_dims(_ndarray_base a, tuple axis): + cdef vector.vector[Py_ssize_t] normalized_axis + cdef out_ndim = a.ndim + len(axis) + cdef shape_t a_shape = a.shape, out_shape + _normalize_axis_tuple(axis, out_ndim, normalized_axis) + out_shape.assign(out_ndim, 0) + cdef Py_ssize_t i, j + for i in normalized_axis: + out_shape[i] = 1 + j = 0 + for i in range(out_ndim): + if out_shape[i] == 1: + continue + out_shape[i] = a_shape[j] + j += 1 + return _reshape(a, out_shape) + + +cpdef _ndarray_base moveaxis(_ndarray_base a, source, destination): + cdef shape_t src, dest + cdef Py_ssize_t ndim = a.ndim + _normalize_axis_tuple(source, ndim, src) + _normalize_axis_tuple(destination, ndim, dest) + + if src.size() != dest.size(): + raise ValueError('`source` and `destination` arguments must have ' + 'the same number of elements') + + cdef vector.vector[Py_ssize_t] order + cdef Py_ssize_t i + for i in range(ndim): + if not _has_element(src, i): + order.push_back(i) + + cdef Py_ssize_t d, s + for d, s in sorted(zip(dest, src)): + order.insert(order.begin() + d, s) + + return _transpose(a, order) + + +cpdef _ndarray_base _move_single_axis( + _ndarray_base a, Py_ssize_t source, Py_ssize_t destination): + """Like moveaxis, but supporting only integer source and destination.""" + cdef Py_ssize_t ndim = a.ndim + source = internal._normalize_axis_index(source, ndim) + destination = internal._normalize_axis_index(destination, ndim) + + if source == destination: + return a + + cdef vector.vector[Py_ssize_t] order + cdef Py_ssize_t i + for i in range(ndim): + if i != source: + order.push_back(i) + + order.insert(order.begin() + destination, source) + return _transpose(a, order) + + +cpdef _ndarray_base rollaxis( + _ndarray_base a, Py_ssize_t axis, Py_ssize_t start=0): + cdef Py_ssize_t i, ndim = a.ndim + cdef vector.vector[Py_ssize_t] axes + if axis < 0: + axis += ndim + if start < 0: + start += ndim + if not (0 <= axis < ndim and 0 <= start <= ndim): + raise ValueError('Axis out of range') + if axis < start: + start -= 1 + if axis == start: + return a + if ndim == 2: + return _transpose(a, axes) + + for i in range(ndim): + axes.push_back(i) + axes.erase(axes.begin() + axis) + axes.insert(axes.begin() + start, axis) + return _transpose(a, axes) + + +cpdef _ndarray_base _reshape(_ndarray_base self, const shape_t &shape_spec): + cdef shape_t shape + cdef strides_t strides + cdef _ndarray_base newarray + shape = internal.infer_unknown_dimension(shape_spec, self.size) + if internal.vector_equal(shape, self._shape): + return self.view() + + _get_strides_for_nocopy_reshape(self, shape, strides) + if strides.size() == shape.size(): + 
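+        # _get_strides_for_nocopy_reshape leaves `strides` shorter than
+        # `shape` when this array's layout cannot express the new shape
+        # without copying, so equal lengths mean a zero-copy view is safe.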
return self._view(type(self), shape, strides, False, True, self) + newarray = self.copy() + _get_strides_for_nocopy_reshape(newarray, shape, strides) + + # TODO(niboshi): Confirm update_x_contiguity flags + newarray._set_shape_and_strides(shape, strides, False, True) + return newarray + + +cpdef _ndarray_base _T(_ndarray_base self): + ret = self.view() + ret._shape.assign(self._shape.rbegin(), self._shape.rend()) + ret._strides.assign(self._strides.rbegin(), self._strides.rend()) + ret._c_contiguous = self._f_contiguous + ret._f_contiguous = self._c_contiguous + return ret + + +cpdef _ndarray_base _transpose( + _ndarray_base self, const vector.vector[Py_ssize_t] &axes): + cdef vector.vector[Py_ssize_t] a_axes + cdef vector.vector[char] axis_flags + cdef Py_ssize_t i, ndim, axis, axes_size + cdef bint is_normal = True, is_trans = True + + axes_size = axes.size() + if axes_size == 0: + return _T(self) + + ndim = self._shape.size() + if axes_size != ndim: + raise ValueError("axes don't match array") + + axis_flags.resize(ndim, 0) + for i in range(axes_size): + axis = axes[i] + if axis < -ndim or axis >= ndim: + raise numpy.AxisError(axis, ndim) + axis %= ndim + a_axes.push_back(axis) + if axis_flags[axis]: + raise ValueError('repeated axis in transpose') + axis_flags[axis] = 1 + is_normal &= i == axis + is_trans &= ndim - 1 - i == axis + + if is_normal: + return self.view() + if is_trans: + return _T(self) + + ret = self.view() + ret._shape.clear() + ret._strides.clear() + for axis in a_axes: + ret._shape.push_back(self._shape[axis]) + ret._strides.push_back(self._strides[axis]) + ret._update_contiguity() + return ret + + +cpdef array_split(_ndarray_base ary, indices_or_sections, Py_ssize_t axis): + cdef Py_ssize_t i, ndim, size, each_size, index, prev, stride + cdef Py_ssize_t num_large + cdef shape_t shape + + ndim = ary.ndim + if -ndim > axis or ndim <= axis: + raise IndexError('Axis exceeds ndim') + if axis < 0: + axis += ndim + size = ary._shape[axis] + + if numpy.isscalar(indices_or_sections): + each_size = (size - 1) // indices_or_sections + num_large = (size - 1) % indices_or_sections + 1 + indices = [i * each_size + min(i, num_large) + for i in range(1, indices_or_sections)] + else: + indices = [i if i >= 0 else size + i for i in indices_or_sections] + + if len(indices) == 0: + return [ary] + + # Make a copy of shape for each view + shape = ary._shape + + prev = 0 + ret = [] + stride = ary._strides[axis] + if ary.size == 0: + stride = 0 + for index in indices: + index = min(index, size) + shape[axis] = max(index - prev, 0) + v = ary.view() + v.data = ary.data + prev * stride + # TODO(niboshi): Confirm update_x_contiguity flags + v._set_shape_and_strides(shape, ary._strides, True, True) + ret.append(v) + + prev = index + + shape[axis] = size - prev + v = ary.view() + v.data = ary.data + prev * stride + # TODO(niboshi): Confirm update_x_contiguity flags + v._set_shape_and_strides(shape, ary._strides, True, True) + ret.append(v) + + return ret + + +cpdef _ndarray_base broadcast_to(_ndarray_base array, shape): + """Broadcast an array to a given shape. + + .. 
seealso::
+        :func:`cupy.broadcast_to` for full documentation,
+        :meth:`numpy.broadcast_to`
+
+    """
+    shape = tuple(shape) if numpy.iterable(shape) else (shape,)
+    cdef int i, j, ndim = array._shape.size(), length = len(shape)
+    cdef Py_ssize_t sh, a_sh
+    if ndim > length:
+        raise ValueError(
+            'input operand has more dimensions than allowed by the axis '
+            'remapping')
+    cdef shape_t _shape = shape
+    cdef strides_t strides
+    strides.assign(length, 0)
+    for i in range(ndim):
+        j = i + length - ndim
+        sh = _shape[j]
+        a_sh = array._shape[i]
+        if sh == a_sh:
+            strides[j] = array._strides[i]
+        elif a_sh != 1:
+            raise ValueError(
+                'operands could not be broadcast together with shape {} and '
+                'requested shape {}'.format(array.shape, shape))
+
+    view = array.view()
+    # TODO(niboshi): Confirm update_x_contiguity flags
+    view._set_shape_and_strides(_shape, strides, True, True)
+    return view
+
+
+cpdef _ndarray_base _repeat(_ndarray_base a, repeats, axis=None):
+    """Repeat arrays along an axis.
+
+    Args:
+        a (cupy.ndarray): Array to transform.
+        repeats (int, list or tuple): The number of repeats.
+        axis (int): The axis to repeat.
+
+    Returns:
+        cupy.ndarray: Transformed array with repeats.
+
+    .. seealso:: :func:`numpy.repeat`
+
+    """
+    cdef _ndarray_base ret
+
+    if isinstance(repeats, _ndarray_base):
+        raise ValueError(
+            'cupy.ndarray cannot be specified as `repeats` argument.')
+
+    # Scalar and size 1 'repeat' arrays broadcast to any shape, for all
+    # other inputs the dimension must match exactly.
+    cdef bint broadcast = False
+    # numpy.issubdtype(1, numpy.integer) fails with old numpy like 1.13.3.
+    if (isinstance(repeats, int) or
+            (hasattr(repeats, 'dtype') and
+             numpy.issubdtype(repeats, numpy.integer))):
+        if repeats < 0:
+            raise ValueError(
+                '\'repeats\' should not be negative: {}'.format(repeats))
+        broadcast = True
+        repeats = [repeats]
+    elif cpython.PySequence_Check(repeats):
+        for rep in repeats:
+            if rep < 0:
+                raise ValueError(
+                    'all elements of \'repeats\' should not be negative: {}'
+                    .format(repeats))
+        if len(repeats) == 1:
+            broadcast = True
+    else:
+        raise ValueError(
+            '\'repeats\' should be int or sequence: {}'.format(repeats))
+
+    if axis is None:
+        if broadcast:
+            a = _reshape(a, (-1, 1))
+            ret = core.ndarray((a.size, repeats[0]), dtype=a.dtype)
+            if ret.size:
+                elementwise_copy(a, ret)
+            return ret.ravel()
+        else:
+            a = a.ravel()
+            axis = 0
+    else:
+        axis = internal._normalize_axis_index(axis, a.ndim)
+
+    if broadcast:
+        repeats = repeats * a._shape[axis]
+    elif a.shape[axis] != len(repeats):
+        raise ValueError(
+            '\'repeats\' and \'axis\' of \'a\' should be same length: {} != {}'
+            .format(a.shape[axis], len(repeats)))
+
+    ret_shape = list(a.shape)
+    ret_shape[axis] = sum(repeats)
+    ret = core.ndarray(ret_shape, dtype=a.dtype)
+    a_index = [slice(None)] * len(ret_shape)
+    ret_index = list(a_index)
+    offset = 0
+    for i in range(a._shape[axis]):
+        if repeats[i] == 0:
+            continue
+        a_index[axis] = slice(i, i + 1)
+        ret_index[axis] = slice(offset, offset + repeats[i])
+        # convert to tuple because cupy has an indexing bug
+        ret[tuple(ret_index)] = a[tuple(a_index)]
+        offset += repeats[i]
+    return ret
+
+
+cpdef _ndarray_base concatenate_method(
+        tup, int axis, _ndarray_base out=None, dtype=None,
+        casting='same_kind'):
+    cdef int ndim0
+    cdef int i
+    cdef _ndarray_base a, a0
+
+    if dtype is not None:
+        dtype = get_dtype(dtype)
+
+    dev_id = device.get_device_id()
+    arrays = _preprocess_args(dev_id, tup, False)
+
+    # Check if the input is not an empty sequence
+    if len(arrays) == 0:
+        raise ValueError('Cannot concatenate from empty tuple')
+
+    # Check types of the input arrays
+    for o in arrays:
+        if not isinstance(o, _ndarray_base):
+            raise TypeError('Only cupy arrays can be concatenated')
+
+    # Check ndim > 0 for the input arrays
+    for o in arrays:
+        a = o
+        if a._shape.size() == 0:
+            raise TypeError('zero-dimensional arrays cannot be concatenated')
+
+    # Check ndim consistency of the input arrays
+    a0 = arrays[0]
+    ndim0 = a0._shape.size()
+    for o in arrays[1:]:
+        a = o
+        if a._shape.size() != ndim0:
+            raise ValueError(
+                'All arrays to concatenate must have the same ndim')
+
+    # Check shape consistency of the input arrays, and compute the output
+    # shape
+    shape0 = a0._shape
+    axis = internal._normalize_axis_index(axis, ndim0)
+    for o in arrays[1:]:
+        a = o
+        for i in range(ndim0):
+            if i != axis and shape0[i] != a._shape[i]:
+                raise ValueError(
+                    'All arrays must have same shape except the axis to '
+                    'concatenate')
+        shape0[axis] += a._shape[axis]
+
+    # Compute the output dtype
+    if out is None:
+        if dtype is None:
+            dtype = a0.dtype
+            have_same_types = True
+            for o in arrays[1:]:
+                have_same_types = have_same_types and (o.dtype == dtype)
+            if not have_same_types:
+                dtype = functools.reduce(
+                    numpy.promote_types, set([a.dtype for a in arrays]))
+    else:
+        if dtype is not None:
+            raise TypeError('concatenate() only takes `out` or `dtype` as an '
+                            'argument, but both were provided.')
+        dtype = out.dtype
+
+    # Check casting rule
+    for o in arrays:
+        _raise_if_invalid_cast(o.dtype, dtype, casting)
+
+    # Prepare the output array
+    shape_t = tuple(shape0)
+    if out is None:
+        out = core.ndarray(shape_t, dtype=dtype)
+    else:
+        if len(out.shape) != len(shape_t):
+            raise ValueError('Output array has wrong dimensionality')
+        if out.shape != shape_t:
+            raise ValueError('Output array is the wrong shape')
+
+    return _concatenate(arrays, axis, shape_t, out, casting)
+
+
+cpdef _ndarray_base _concatenate(
+        list arrays, Py_ssize_t axis, tuple shape, _ndarray_base out,
+        str casting):
+    cdef _ndarray_base a, b
+    cdef Py_ssize_t i, aw, itemsize, axis_size
+    cdef bint all_same_type, same_shape_and_contiguous
+    # If arrays are large, issuing each copy method is efficient.
+    cdef Py_ssize_t threshold_size = 2 * 1024 * 1024
+
+    dtype = out.dtype
+
+    if len(arrays) > 8:
+        all_same_type = True
+        same_shape_and_contiguous = True
+        axis_size = shape[axis] // len(arrays)
+        total_bytes = 0
+        itemsize = dtype.itemsize
+        for a in arrays:
+            if a.dtype != dtype:
+                all_same_type = False
+                break
+            if same_shape_and_contiguous:
+                same_shape_and_contiguous = (
+                    a._c_contiguous and a._shape[axis] == axis_size)
+            total_bytes += a.size * itemsize
+
+        if all_same_type and total_bytes < threshold_size * len(arrays):
+            return _concatenate_single_kernel(
+                arrays, axis, shape, dtype, same_shape_and_contiguous, out)
+
+    i = 0
+    slice_list = [slice(None)] * len(shape)
+    for a in arrays:
+        aw = a._shape[axis]
+        slice_list[axis] = slice(i, i + aw)
+        b = out[tuple(slice_list)]
+        elementwise_copy(a, b, casting=casting)
+        i += aw
+    return out
+
+
+cpdef Py_ssize_t size(_ndarray_base a, axis=None) except? -1:
+    """Returns the number of elements along a given axis.
+
+    Args:
+        a (ndarray): Input data.
+        axis (int or None): Axis along which the elements are counted.
+            When it is ``None``, it returns the total number of elements.
+
+    Returns:
+        int: Number of elements along the given axis.
+ + """ + cdef int index, ndim + if axis is None: + return a.size + else: + index = axis + ndim = a._shape.size() + if index < 0: + index += ndim + if not 0 <= index < ndim: + raise IndexError('index out of range') + return a._shape[index] + + +# private + + +cdef bint _has_element(const shape_t &source, Py_ssize_t n): + for i in range(source.size()): + if source[i] == n: + return True + return False + + +cdef _get_strides_for_nocopy_reshape( + _ndarray_base a, const shape_t &newshape, strides_t &newstrides): + cdef Py_ssize_t size, itemsize, ndim, dim, last_stride + size = a.size + newstrides.clear() + + itemsize = a.itemsize + if size == 1: + newstrides.assign(newshape.size(), itemsize) + return + if size == 0: + internal.get_contiguous_strides_inplace( + newshape, newstrides, itemsize, True, False) + return + + cdef shape_t shape + cdef strides_t strides + internal.get_reduced_dims(a._shape, a._strides, itemsize, shape, strides) + + ndim = shape.size() + dim = 0 + last_stride = shape[0] * strides[0] + for i in range(newshape.size()): + size = newshape[i] + if size <= 1: + newstrides.push_back(last_stride) + continue + if dim >= ndim or shape[dim] % size != 0: + newstrides.clear() + break + shape[dim] //= size + last_stride = shape[dim] * strides[dim] + newstrides.push_back(last_stride) + if shape[dim] == 1: + dim += 1 + + +cdef _normalize_axis_tuple(axis, Py_ssize_t ndim, shape_t &ret): + """Normalizes an axis argument into a tuple of non-negative integer axes. + + Arguments `argname` and `allow_duplicate` are not supported. + + """ + if numpy.isscalar(axis): + axis = (axis,) + + for ax in axis: + ax = internal._normalize_axis_index(ax, ndim) + if _has_element(ret, ax): + # the message in `numpy.core.numeric.normalize_axis_tuple` + raise ValueError('repeated axis') + ret.push_back(ax) + + +cdef _ndarray_base _concatenate_single_kernel( + list arrays, Py_ssize_t axis, tuple shape, dtype, + bint same_shape_and_contiguous, _ndarray_base out): + cdef _ndarray_base a, x + cdef Py_ssize_t base, cum, ndim + cdef int i, j + cdef Py_ssize_t[:] ptrs + cdef Py_ssize_t[:] cum_sizes + cdef Py_ssize_t[:, :] x_strides + cdef int device_id = device.get_device_id() + + assert out is not None + + ptrs = numpy.ndarray(len(arrays), numpy.int64) + for i, a in enumerate(arrays): + _check_peer_access(a, device_id) + ptrs[i] = a.data.ptr + x = core.array(ptrs) + + if same_shape_and_contiguous: + base = internal.prod_sequence(shape[axis:]) // len(arrays) + _concatenate_kernel_same_size(x, base, out) + return out + + ndim = len(shape) + x_strides = numpy.ndarray((len(arrays), ndim), numpy.int64) + cum_sizes = numpy.ndarray(len(arrays), numpy.int64) + cum = 0 + for i, a in enumerate(arrays): + for j in range(ndim): + x_strides[i, j] = a._strides[j] + cum_sizes[i] = cum + cum += a._shape[axis] + + _concatenate_kernel( + x, axis, core.array(cum_sizes), core.array(x_strides), out) + return out + + +cdef _concatenate_kernel_same_size = ElementwiseKernel( + 'raw P x, int64 base', + 'T y', + ''' + ptrdiff_t middle = i / base; + ptrdiff_t top = middle / x.size(); + ptrdiff_t array_ind = middle - top * x.size(); + ptrdiff_t offset = i + (top - middle) * base; + y = reinterpret_cast(x[array_ind])[offset]; + ''', + 'cupy_concatenate_same_size' +) + + +cdef _concatenate_kernel = ElementwiseKernel( + '''raw P x, int32 axis, raw int64 cum_sizes, raw int64 x_strides''', + 'T y', + ''' + ptrdiff_t axis_ind = _ind.get()[axis]; + ptrdiff_t left = 0; + ptrdiff_t right = cum_sizes.size(); + + while (left < right - 1) { + ptrdiff_t 
m = (left + right) / 2; + if (axis_ind < cum_sizes[m]) { + right = m; + } else { + left = m; + } + } + + ptrdiff_t array_ind = left; + axis_ind -= cum_sizes[left]; + char* ptr = reinterpret_cast(x[array_ind]); + for (int j = _ind.ndim - 1; j >= 0; --j) { + ptrdiff_t ind[] = {array_ind, j}; + ptrdiff_t offset; + if (j == axis) { + offset = axis_ind; + } else { + offset = _ind.get()[j]; + } + ptr += x_strides[ind] * offset; + } + + y = *reinterpret_cast(ptr); + ''', + 'cupy_concatenate', + reduce_dims=False +) diff --git a/cupy/_core/_routines_math.pxd b/cupy/_core/_routines_math.pxd new file mode 100644 index 0000000..864d178 --- /dev/null +++ b/cupy/_core/_routines_math.pxd @@ -0,0 +1,39 @@ +from cupy._core.core cimport _ndarray_base + + +cdef _ndarray_base _ndarray_conj(_ndarray_base self) +cdef _ndarray_base _ndarray_real_getter(_ndarray_base self) +cdef _ndarray_base _ndarray_real_setter(_ndarray_base self, value) +cdef _ndarray_base _ndarray_imag_getter(_ndarray_base self) +cdef _ndarray_base _ndarray_imag_setter(_ndarray_base self, value) +cdef _ndarray_base _ndarray_prod( + _ndarray_base self, axis, dtype, out, keepdims) +cdef _ndarray_base _ndarray_sum(_ndarray_base self, axis, dtype, out, keepdims) +cdef _ndarray_base _ndarray_cumsum(_ndarray_base self, axis, dtype, out) +cdef _ndarray_base _ndarray_cumprod(_ndarray_base self, axis, dtype, out) +cdef _ndarray_base _ndarray_clip(_ndarray_base self, a_min, a_max, out) + +cpdef _ndarray_base _nansum(_ndarray_base a, axis, dtype, out, keepdims) +cpdef _ndarray_base _nanprod(_ndarray_base a, axis, dtype, out, keepdims) + +cpdef enum scan_op: + SCAN_SUM = 0 + SCAN_PROD = 1 + +cdef _ndarray_base scan(_ndarray_base a, op, dtype=*, _ndarray_base out=*, + incomplete=*, chunk_size=*) +cdef object _sum_auto_dtype +cdef object _add +cdef object _conj +cdef object _angle +cdef object _positive +cdef object _negative +cdef object _multiply +cdef object _divide +cdef object _power +cdef object _subtract +cdef object _true_divide +cdef object _floor_divide +cdef object _remainder +cdef object _absolute +cdef object _sqrt diff --git a/cupy/_core/_routines_math.pyx b/cupy/_core/_routines_math.pyx new file mode 100644 index 0000000..440c8cb --- /dev/null +++ b/cupy/_core/_routines_math.pyx @@ -0,0 +1,1145 @@ +import string + +import numpy + +import cupy +from cupy._core._reduction import create_reduction_func +from cupy._core._kernel import create_ufunc, _get_warpsize +from cupy._core._scalar import get_typename +from cupy._core._ufuncs import elementwise_copy +import cupy._core.core as core +from cupy._core cimport internal +from cupy import _util + +from cupy_backends.cuda.api cimport runtime +from cupy._core cimport _accelerator +from cupy._core._dtype cimport get_dtype +from cupy._core.core cimport _ndarray_init +from cupy._core.core cimport compile_with_cache +from cupy._core.core cimport _ndarray_base +from cupy.cuda cimport memory + +from cupy.cuda import cub + +try: + import cupy_backends.cuda.libs.cutensor as cuda_cutensor +except ImportError: + cuda_cutensor = None + + +# _ndarray_base members + + +cdef _ndarray_base _ndarray_conj(_ndarray_base self): + if self.dtype.kind == 'c': + return _conjugate(self) + else: + return self + + +cdef _ndarray_base _ndarray_real_getter(_ndarray_base self): + if self.dtype.kind == 'c': + dtype = get_dtype(self.dtype.char.lower()) + view = core.ndarray.__new__( + type(self), shape=self._shape, dtype=dtype, _obj=self, + memptr=self.data, strides=self._strides) + (<_ndarray_base>view).base = ( + self.base if 
self.base is not None else self) + return view + return self + + +cdef _ndarray_base _ndarray_real_setter(_ndarray_base self, value): + elementwise_copy(value, _ndarray_real_getter(self)) + + +cdef _ndarray_base _ndarray_imag_getter(_ndarray_base self): + cdef memory.MemoryPointer memptr + if self.dtype.kind == 'c': + dtype = get_dtype(self.dtype.char.lower()) + memptr = self.data + # Make the memory pointer point to the first imaginary element. + # Note that even if the array doesn't have a valid memory (e.g. 0-size + # array), the resulting array should be a view of the original array, + # aligning with NumPy behavior. + if memptr.ptr != 0: + memptr = memptr + self.dtype.itemsize // 2 + view = core.ndarray.__new__( + type(self), shape=self._shape, dtype=dtype, memptr=memptr, + strides=self._strides) + (<_ndarray_base>view).base = ( + self.base if self.base is not None else self) + return view + new_array = core.ndarray.__new__(type(self), self.shape, dtype=self.dtype) + new_array.fill(0) + return new_array + + +cdef _ndarray_base _ndarray_imag_setter(_ndarray_base self, value): + if self.dtype.kind == 'c': + elementwise_copy(value, _ndarray_imag_getter(self)) + else: + raise TypeError('cupy.ndarray does not have imaginary part to set') + + +cdef _ndarray_base _ndarray_prod( + _ndarray_base self, axis, dtype, out, keepdims): + for accelerator in _accelerator._routine_accelerators: + result = None + if accelerator == _accelerator.ACCELERATOR_CUB: + # result will be None if the reduction is not compatible with CUB + result = cub.cub_reduction( + self, cub.CUPY_CUB_PROD, axis, dtype, out, keepdims) + if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and + cuda_cutensor is not None): + from cupyx import cutensor + result = cutensor._try_reduction_routine( + self, axis, dtype, out, keepdims, cuda_cutensor.OP_MUL, 1, 0) + if result is not None: + return result + if dtype is None: + return _prod_auto_dtype(self, axis, dtype, out, keepdims) + else: + return _prod_keep_dtype(self, axis, dtype, out, keepdims) + + +cdef _ndarray_base _ndarray_sum( + _ndarray_base self, axis, dtype, out, keepdims): + for accelerator in _accelerator._routine_accelerators: + result = None + if accelerator == _accelerator.ACCELERATOR_CUB: + # result will be None if the reduction is not compatible with CUB + result = cub.cub_reduction( + self, cub.CUPY_CUB_SUM, axis, dtype, out, keepdims) + if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and + cuda_cutensor is not None): + from cupyx import cutensor + result = cutensor._try_reduction_routine( + self, axis, dtype, out, keepdims, cuda_cutensor.OP_ADD, 1, 0) + if result is not None: + return result + + if dtype is None: + return _sum_auto_dtype(self, axis, dtype, out, keepdims) + else: + return _sum_keep_dtype(self, axis, dtype, out, keepdims) + + +cdef _ndarray_base _ndarray_cumsum(_ndarray_base self, axis, dtype, out): + return cupy.cumsum(self, axis, dtype, out) + + +cdef _ndarray_base _ndarray_cumprod(_ndarray_base self, axis, dtype, out): + return cupy.cumprod(self, axis, dtype, out) + + +cdef _ndarray_base _ndarray_clip(_ndarray_base self, a_min, a_max, out): + if a_min is None and a_max is None: + raise ValueError('array_clip: must set either max or min') + kind = self.dtype.kind + if a_min is None: + if kind == 'f': + a_min = self.dtype.type('-inf') + elif kind in 'iu': + a_min = numpy.iinfo(self.dtype.type).min + if a_max is None: + if kind == 'f': + a_max = self.dtype.type('inf') + elif kind in 'iu': + a_max = numpy.iinfo(self.dtype.type).max + return 
_clip(self, a_min, a_max, out=out)
+
+
+# private/internal
+
+_op_char = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
+_identity = {scan_op.SCAN_SUM: 0, scan_op.SCAN_PROD: 1}
+
+
+@cupy._util.memoize(for_each_device=True)
+def _cupy_bsum_shfl(op, chunk_size, warp_size=32):
+    """Returns a kernel that computes the sum/prod of each thread-block.
+
+    Args:
+        op (int): Operation type. SCAN_SUM or SCAN_PROD.
+        chunk_size (int): Number of array elements processed by a single
+            thread-block.
+        warp_size (int): Warp size.
+
+    Returns:
+        cupy.ElementwiseKernel
+
+    Example:
+        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        _cupy_bsum_shfl(op=SCAN_SUM, chunk_size=4)(a, b, ...)
+        b == [10, 26, 19]
+
+    Note:
+        This uses warp shuffle functions to exchange data in a warp.
+        See the link below for details about warp shuffle functions.
+        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-shuffle-functions
+    """
+    block_size = chunk_size // 2  # each thread handles two elements
+    in_params = 'raw T a'
+    out_params = 'raw O b'
+    loop_prep = string.Template("""
+        __shared__ O smem[${block_size} / ${warp_size}];
+        const int n_warp = ${block_size} / ${warp_size};
+        const int warp_id = threadIdx.x / ${warp_size};
+        const int lane_id = threadIdx.x % ${warp_size};
+    """).substitute(block_size=block_size, warp_size=warp_size)
+    loop_body = string.Template("""
+        O x = ${identity};
+        if (2*i < a.size()) x = a[2*i];
+        if (2*i + 1 < a.size()) x ${op}= a[2*i + 1];
+        for (int j = 1; j < ${warp_size}; j *= 2) {
+            x ${op}= __shfl_xor_sync(0xffffffff, x, j, ${warp_size});
+        }
+        if (lane_id == 0) smem[warp_id] = x;
+        __syncthreads();
+        if (warp_id == 0) {
+            x = ${identity};
+            if (lane_id < n_warp) x = smem[lane_id];
+            for (int j = 1; j < n_warp; j *= 2) {
+                x ${op}= __shfl_xor_sync(0xffffffff, x, j, ${warp_size});
+            }
+            int block_id = i / ${block_size};
+            if (lane_id == 0) b[block_id] = x;
+        }
+    """).substitute(block_size=block_size, warp_size=warp_size,
+                    op=_op_char[op], identity=_identity[op])
+    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
+                                  'cupy_bsum_shfl', loop_prep=loop_prep)
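+# [Illustrative note, not part of the upstream file] The two block-sum
+# kernels above and below compute, per thread-block, the reduction of one
+# chunk of the input; only the intra-warp exchange differs (warp shuffles
+# vs. shared memory). A NumPy sketch of the contract, assuming SCAN_SUM:
+#
+#     >>> import numpy as np
+#     >>> a = np.arange(1, 11)            # [1, 2, ..., 10]
+#     >>> chunk = 4
+#     >>> [int(a[i:i + chunk].sum()) for i in range(0, len(a), chunk)]
+#     [10, 26, 19]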
+ """ + block_size = chunk_size // 2 # each thread handles two elements + in_params = 'raw T a' + out_params = 'raw O b' + loop_prep = string.Template(""" + __shared__ O smem1[${block_size}]; + __shared__ O smem2[${warp_size}]; + const int n_warp = ${block_size} / ${warp_size}; + const int warp_id = threadIdx.x / ${warp_size}; + const int lane_id = threadIdx.x % ${warp_size}; + """).substitute(block_size=block_size, warp_size=warp_size) + loop_body = string.Template(""" + O x = ${identity}; + if (2*i < a.size()) x = a[2*i]; + if (2*i + 1 < a.size()) x ${op}= a[2*i + 1]; + for (int j = 1; j < ${warp_size}; j *= 2) { + smem1[threadIdx.x] = x; __syncwarp(); + x ${op}= smem1[threadIdx.x ^ j]; __syncwarp(); + } + if (lane_id == 0) smem2[warp_id] = x; + __syncthreads(); + if (warp_id == 0) { + x = ${identity}; + if (lane_id < n_warp) x = smem2[lane_id]; + for (int j = 1; j < n_warp; j *= 2) { + smem2[lane_id] = x; __syncwarp(); + x ${op}= smem2[lane_id ^ j]; __syncwarp(); + } + int block_id = i / ${block_size}; + if (lane_id == 0) b[block_id] = x; + } + """).substitute(block_size=block_size, warp_size=warp_size, + op=_op_char[op], identity=_identity[op]) + return cupy.ElementwiseKernel(in_params, out_params, loop_body, + 'cupy_bsum_smem', loop_prep=loop_prep) + + +@cupy._util.memoize(for_each_device=True) +def _cupy_scan_naive(op, chunk_size, warp_size=32): + """Returns a kernel to compute an inclusive scan. + + It first performs an inclusive scan in each thread-block and then add the + scan results for the sum/prod of the chunks. + + Args: + op (int): Operation type. SCAN_SUM or SCAN_PROD. + chunk_size (int): Number of array elements processed by a single + thread-block. + warp_size (int); Warp size. + + Returns: + cupy.ElementwiseKernel + + Example: + b = [10, 36, 55] + a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + _cupy_scan(op=SCAN_SUM, chunk_size=4)(b, a, out, ...) + out == [1, 3, 6, 10, 15, 21, 28, 36, 45, 55] + + Note: + This uses a kind of method called "Naive Parallel Scan" for inclusive + scan in each thread-block. See below for details about it. 
+
+
+@cupy._util.memoize(for_each_device=True)
+def _cupy_scan_naive(op, chunk_size, warp_size=32):
+    """Returns a kernel to compute an inclusive scan.
+
+    It first performs an inclusive scan in each thread-block and then adds
+    the scan results for the sum/prod of the chunks.
+
+    Args:
+        op (int): Operation type. SCAN_SUM or SCAN_PROD.
+        chunk_size (int): Number of array elements processed by a single
+            thread-block.
+        warp_size (int): Warp size.
+
+    Returns:
+        cupy.ElementwiseKernel
+
+    Example:
+        b = [10, 36, 55]
+        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        _cupy_scan_naive(op=SCAN_SUM, chunk_size=4)(b, a, out, ...)
+        out == [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]
+
+    Note:
+        This uses the method known as "Naive Parallel Scan" for the
+        inclusive scan in each thread-block. See the link below for details.
+        https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
+    """
+    in_params = 'raw O b'
+    out_params = 'raw T a, raw O out'
+    loop_prep = string.Template("""
+        __shared__ O smem1[${block_size}];
+        __shared__ O smem2[${warp_size}];
+        const int n_warp = ${block_size} / ${warp_size};
+        const int warp_id = threadIdx.x / ${warp_size};
+        const int lane_id = threadIdx.x % ${warp_size};
+    """).substitute(block_size=chunk_size, warp_size=warp_size)
+    loop_body = string.Template("""
+        O x = ${identity};
+        if (i < a.size()) x = a[i];
+        for (int j = 1; j < ${warp_size}; j *= 2) {
+            smem1[threadIdx.x] = x; __syncwarp();
+            if (lane_id - j >= 0) x ${op}= smem1[threadIdx.x - j];
+            __syncwarp();
+        }
+        if (lane_id == ${warp_size} - 1) smem2[warp_id] = x;
+        __syncthreads();
+        if (warp_id == 0) {
+            O y = ${identity};
+            if (lane_id < n_warp) y = smem2[lane_id];
+            for (int j = 1; j < n_warp; j *= 2) {
+                smem2[lane_id] = y; __syncwarp();
+                if (lane_id - j >= 0) y ${op}= smem2[lane_id - j];
+                __syncwarp();
+            }
+            smem2[lane_id] = y;
+        }
+        __syncthreads();
+        if (warp_id > 0) x ${op}= smem2[warp_id - 1];
+        int block_id = i / ${block_size};
+        if (block_id > 0) x ${op}= b[block_id - 1];
+        if (i < a.size()) out[i] = x;
+    """).substitute(block_size=chunk_size, warp_size=warp_size,
+                    op=_op_char[op], identity=_identity[op])
+    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
+                                  'cupy_scan_naive', loop_prep=loop_prep)
+
+
+@cupy._util.memoize(for_each_device=True)
+def _cupy_scan_btree(op, chunk_size, warp_size=32):
+    """Returns a kernel to compute an inclusive scan.
+
+    It first performs an inclusive scan in each thread-block and then adds
+    the scan results for the sum/prod of the chunks.
+
+    Args:
+        op (int): Operation type. SCAN_SUM or SCAN_PROD.
+        chunk_size (int): Number of array elements processed by a single
+            thread-block.
+        warp_size (int): Warp size.
+
+    Returns:
+        cupy.ElementwiseKernel
+
+    Example:
+        b = [10, 36, 55]
+        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        _cupy_scan_btree(op=SCAN_SUM, chunk_size=4)(b, a, out, ...)
+        out == [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]
+
+    Note:
+        This uses the method known as "Work-Efficient Parallel Scan" for
+        the inclusive scan in each thread-block. See the link below for
+        details.
+        https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
+    """
+    in_params = 'raw O b'
+    out_params = 'raw T a, raw O out'
+    loop_prep = string.Template("""
+        __shared__ O smem0[${block_size} + 1];
+        O *smem1 = smem0 + 1;
+        __shared__ O smem2[${warp_size}];
+        const int n_warp = ${block_size} / ${warp_size};
+        const int warp_id = threadIdx.x / ${warp_size};
+        const int lane_id = threadIdx.x % ${warp_size};
+        if (threadIdx.x == 0) smem0[0] = ${identity};
+    """).substitute(block_size=chunk_size, warp_size=warp_size,
+                    identity=_identity[op])
+    loop_body = string.Template("""
+        O x = ${identity};
+        if (i < a.size()) x = a[i];
+        for (int j = 1; j < ${warp_size}; j *= 2) {
+            smem1[threadIdx.x] = x; __syncwarp();
+            if (lane_id % (2*j) == (2*j)-1) {
+                x ${op}= smem1[threadIdx.x - j];
+            }
+            __syncwarp();
+        }
+        smem1[threadIdx.x] = x;
+        __syncthreads();
+        if (warp_id == 0) {
+            O y = ${identity};
+            if (lane_id < n_warp) {
+                y = smem0[${warp_size} * (lane_id + 1)];
+            }
+            for (int j = 1; j < n_warp; j *= 2) {
+                smem2[lane_id] = y; __syncwarp();
+                if (lane_id % (2*j) == (2*j)-1) {
+                    y ${op}= smem2[lane_id - j];
+                }
+                __syncwarp();
+            }
+            for (int j = n_warp / 4; j > 0; j /= 2) {
+                smem2[lane_id] = y; __syncwarp();
+                if ((lane_id % (2*j) == j-1) && (lane_id >= 2*j)) {
+                    y ${op}= smem2[lane_id - j];
+                }
+                __syncwarp();
+            }
+            if (lane_id < n_warp) {
+                smem0[${warp_size} * (lane_id + 1)] = y;
+            }
+        }
+        __syncthreads();
+        x = smem0[threadIdx.x];
+        for (int j = ${warp_size} / 2; j > 0; j /= 2) {
+            if (lane_id % (2*j) == j) {
+                x ${op}= smem0[threadIdx.x - j];
+            }
+            __syncwarp();
+            smem0[threadIdx.x] = x; __syncwarp();
+        }
+        __syncthreads();
+        x = smem1[threadIdx.x];
+        int block_id = i / ${block_size};
+        if (block_id > 0) x ${op}= b[block_id - 1];
+        if (i < a.size()) out[i] = x;
+    """).substitute(block_size=chunk_size, warp_size=warp_size,
+                    op=_op_char[op], identity=_identity[op])
+    return cupy.ElementwiseKernel(in_params, out_params, loop_body,
+                                  'cupy_scan_btree', loop_prep=loop_prep)
+
+
+cdef _ndarray_base scan(
+        _ndarray_base a, op, dtype=None, _ndarray_base out=None,
+        incomplete=False, chunk_size=512):
+    """Return the prefix sum (scan) of the elements.
+
+    Args:
+        a (cupy.ndarray): input array.
+        out (cupy.ndarray): Alternative output array in which to place
+            the result. The same size and same type as the input array (a).
+
+    Returns:
+        cupy.ndarray: A new array holding the result is returned.
+
+    """
+    if a._shape.size() != 1:
+        raise TypeError('Input array should be a 1D array.')
+
+    if out is None:
+        if dtype is None:
+            dtype = a.dtype
+        if not incomplete:
+            out = _ndarray_init(cupy.ndarray, a._shape, dtype, None)
+    else:
+        if a.size != out.size:
+            raise ValueError('Provided out is the wrong size')
+        dtype = out.dtype
+    dtype = numpy.dtype(dtype)
+
+    warp_size = _get_warpsize()
+    if runtime._is_hip_environment:
+        if dtype.char in 'iIfdlq':
+            # On HIP, __shfl* supports int, unsigned int, float, double,
+            # long, and long long. The documentation is too outdated and
+            # unreliable; refer to the header at
+            # $ROCM_HOME/include/hip/hcc_detail/device_functions.h
+            bsum_kernel = _cupy_bsum_shfl(op, chunk_size, warp_size)
+        else:
+            bsum_kernel = _cupy_bsum_smem(op, chunk_size, warp_size)
+    else:
+        if dtype.char in 'iIlLqQfd':
+            bsum_kernel = _cupy_bsum_shfl(op, chunk_size, warp_size)
+        else:
+            bsum_kernel = _cupy_bsum_smem(op, chunk_size, warp_size)
+    if dtype.char in 'fdFD':
+        scan_kernel = _cupy_scan_btree(op, chunk_size, warp_size)
+    else:
+        scan_kernel = _cupy_scan_naive(op, chunk_size, warp_size)
+    b_size = (a.size + chunk_size - 1) // chunk_size
+    b = cupy.empty((b_size,), dtype=dtype)
+    size = b.size * chunk_size
+
+    if a.size > chunk_size:
+        bsum_kernel(a, b, size=size // 2, block_size=chunk_size // 2)
+        scan(b, op, dtype=dtype, out=b)
+        if incomplete:
+            return b
+        scan_kernel(b, a, out, size=size, block_size=chunk_size)
+    else:
+        scan_kernel(b, a, out, size=size, block_size=chunk_size)
+
+    return out
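+# [Illustrative note, not part of the upstream file] When the input spans
+# several chunks, `scan` above works in two passes: per-chunk reductions via
+# the bsum kernel, a recursive scan of those partial sums, then a final pass
+# that folds each chunk's carry-in back in. A rough NumPy reference model
+# (the helper name `scan_ref` is hypothetical):
+#
+#     >>> import numpy as np
+#     >>> def scan_ref(a, chunk):
+#     ...     b = np.add.reduceat(a, np.arange(0, len(a), chunk))
+#     ...     carries = np.cumsum(b)               # scan of block sums
+#     ...     out = np.concatenate([np.cumsum(a[i:i + chunk])
+#     ...                           for i in range(0, len(a), chunk)])
+#     ...     for j, i in enumerate(range(chunk, len(a), chunk)):
+#     ...         out[i:i + chunk] += carries[j]   # add carry-in per chunk
+#     ...     return out
+#     >>> scan_ref(np.arange(1, 11), 4).tolist()
+#     [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]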
+
+
+@_util.memoize(for_each_device=True)
+def _inclusive_batch_scan_kernel(
+        dtype, block_size, op, src_c_cont, out_c_cont):
+    """Returns a prefix-sum (scan) CUDA kernel for a 2D array over axis 1.
+
+    The kernel is used for scanning over different axes.
+
+    e.g.
+    if block_size > len(src[0])
+    src [[1, 2, 3, 4],
+         [5, 6, 7, 8]]
+    dst [[1, 3, 6, 10],
+         [5, 11, 18, 26]]
+
+    if block_size < len(src[0])
+    block_size: 2
+    # TODO show partialness
+    src [[1, 2, 3, 4],
+         [5, 6, 7, 8]]
+    dst [[1, 3, 3, 7],
+         [5, 11, 7, 15]]
+
+    Args:
+        dtype: dtype of the src and dst arrays
+        block_size: number of threads per block
+
+    Returns:
+        cupy.cuda.Function: CUDA function
+    """
+    op_char = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'}
+    identity = {scan_op.SCAN_SUM: 0, scan_op.SCAN_PROD: 1}
+    name = 'cupy_inclusive_batch_scan_kernel'
+    dtype = get_typename(dtype)
+    source = string.Template("""
+    extern "C" __global__ void ${name}(
+        const CArray<${dtype}, 2, ${src_c_cont}> src,
+        CArray<${dtype}, 2, ${out_c_cont}> dst, int batch_size){
+        long long n = src.size();
+
+        extern __shared__ ${dtype} temp[];
+
+        unsigned int thid = threadIdx.x;
+        unsigned int block = blockIdx.x * blockDim.x;
+
+        unsigned int pad_batch_size = batch_size;
+        bool must_copy = true;
+
+        if (batch_size & (batch_size -1)) {
+            pad_batch_size = 1 << (32 - __clz(batch_size));
+            must_copy = (thid & (pad_batch_size-1)) < batch_size;
+        }
+        if (pad_batch_size > ${block_size}) {
+            int blocks_per_batch = (batch_size - 1) / ${block_size} + 1;
+            pad_batch_size = ${block_size} * blocks_per_batch;
+
+            // must_copy is enabled for all blocks but the last one in the
+            // batch
+            bool last_block = (blockIdx.x + 1) % blocks_per_batch == 0;
+            int remaining_batch = batch_size % ${block_size};
+            if (remaining_batch == 0) {
+                remaining_batch = ${block_size};
+            }
+            must_copy = !last_block || (thid < (remaining_batch));
+        }
+
+        int pad_per_batch = pad_batch_size-batch_size;
+        int n_batches_block = ${block_size} / pad_batch_size;
+
+        unsigned int idx0 = thid + block;
+
+        int batch_id = idx0 / pad_batch_size;
+        idx0 = idx0 - pad_per_batch * batch_id;
+
+        int row = idx0 / batch_size;
+        int col = idx0 % batch_size;
+        const ptrdiff_t idx0_idx[] = {row, col};
+
+        if(idx0 < n){
+            temp[thid] = (must_copy) ?
src[idx0_idx] : (${dtype}) ${identity}; + __syncthreads(); + if (!n_batches_block) { + n_batches_block = 1; + pad_batch_size = ${block_size}; + } + for (int j = 0; j < n_batches_block; j++) { + int offset = j * pad_batch_size; + for (int i = 1; i <= pad_batch_size; i <<= 1) { + int index = ((threadIdx.x + 1) * 2 * i - 1); + int index_block = offset + index; + if (index < (pad_batch_size)){ + temp[index_block] ${op}= temp[index_block - i]; + } + __syncthreads(); + } + for(int i = pad_batch_size >> 1; i > 0; i >>= 1){ + int index = ((threadIdx.x + 1) * 2 * i - 1); + int index_block = offset + index; + if((index + i) < (pad_batch_size)){ + temp[index_block + i] ${op}= temp[index_block]; + } + __syncthreads(); + } + } + if(must_copy){ + dst[idx0_idx] = temp[thid]; + } + } + } + """).substitute(name=name, dtype=dtype, block_size=block_size, + op=op_char[op], identity=identity[op], + src_c_cont=src_c_cont, out_c_cont=out_c_cont) + module = compile_with_cache(source) + return module.get_function(name) + + +@_util.memoize(for_each_device=True) +def _add_scan_batch_blocked_sum_kernel(dtype, op, block_size, c_cont): + name = 'cupy_add_scan_blocked_sum_kernel' + dtype = get_typename(dtype) + ops = {scan_op.SCAN_SUM: '+', scan_op.SCAN_PROD: '*'} + source = string.Template(""" + extern "C" __global__ void ${name}(CArray<${dtype}, 2, ${c_cont}> src_dst, + int batch_size){ + long long n = src_dst.size(); + + unsigned int thid = threadIdx.x; + unsigned int block = blockIdx.x * ${block_size}; + + unsigned int idx0 = thid + block; + + // Respect padding + unsigned int row = idx0 / batch_size; + unsigned int col = idx0 % batch_size; + int my_block = ${block_size} * (col / ${block_size}); + const ptrdiff_t dst_idx[] = {row, col}; + const ptrdiff_t src_idx[] = {row, my_block - 1}; + + // Avoid for the first block of every row + // This can be tweaked with kernel launch settings + bool first = col < ${block_size}; + bool is_block = (col % (${block_size})) == ${block_size} - 1; + if(idx0 < n && !first && !is_block){ + src_dst[dst_idx] ${op}= src_dst[src_idx]; + } + } + """).substitute(name=name, dtype=dtype, op=ops[op], block_size=block_size, + c_cont=c_cont) + module = compile_with_cache(source) + return module.get_function(name) + + +cdef _ndarray_base _batch_scan_op( + _ndarray_base a, scan_op op, _ndarray_base out): + batch_size = a.shape[1] + # TODO(ecastill) replace this with "_reduction._block_size" once it is + # properly exposed + block_size = 512 + # Since we need to pad each batch we spawn more threads as some + # of them will be idle + # Calc the total number of blocks + padded_bs = 1 << ((batch_size - 1).bit_length()) + if padded_bs > block_size: + blocks_per_batch = (batch_size - 1) // block_size + 1 + padded_bs = block_size * blocks_per_batch + padded_size = a.size // batch_size * padded_bs + + cdef int src_cont = int(a.flags.c_contiguous) + cdef int out_cont = int(out.flags.c_contiguous) + kern_scan = _inclusive_batch_scan_kernel(a.dtype, block_size, op, + src_cont, out_cont) + kern_scan(grid=((padded_size - 1) // (block_size) + 1,), + block=(block_size,), + args=(a, out, batch_size), + shared_mem=a.itemsize * block_size) + if batch_size > block_size: + blocked_sum = out[:, block_size-1::block_size] + _batch_scan_op(blocked_sum, op, blocked_sum) + kern_add = _add_scan_batch_blocked_sum_kernel( + out.dtype, op, block_size, out_cont) + kern_add( + grid=((out.size - 1) // (block_size) + 1,), + block=(block_size,), + args=(out, batch_size)) + return out + + +cdef _proc_as_batch(_ndarray_base x, int 
axis, scan_op op): + if x.shape[axis] == 0: + return cupy.empty_like(x) + t = cupy.rollaxis(x, axis, x.ndim) + s = t.shape + r = t.reshape(-1, x.shape[axis]) + _batch_scan_op(r, op, r) + return cupy.rollaxis(r.reshape(s), x.ndim-1, axis) + + +cpdef scan_core( + _ndarray_base a, axis, scan_op op, dtype=None, _ndarray_base out=None): + if out is None: + if dtype is None: + kind = a.dtype.kind + if kind == 'b': + dtype = numpy.dtype('l') + elif kind == 'i' and a.dtype.itemsize < numpy.dtype('l').itemsize: + dtype = numpy.dtype('l') + elif kind == 'u' and a.dtype.itemsize < numpy.dtype('L').itemsize: + dtype = numpy.dtype('L') + else: + dtype = a.dtype + result = None + else: + if (out.flags.c_contiguous or out.flags.f_contiguous): + result = out + elementwise_copy(a, result) + else: + result = a.astype(out.dtype, order='C') + + if axis is None: + for accelerator in _accelerator._routine_accelerators: + if accelerator == _accelerator.ACCELERATOR_CUB: + if result is None: + result = a.astype(dtype, order='C').ravel() + # result will be None if the scan is not compatible with CUB + if op == scan_op.SCAN_SUM: + cub_op = cub.CUPY_CUB_CUMSUM + else: + cub_op = cub.CUPY_CUB_CUMPROD + res = cub.cub_scan(result, cub_op) + if res is not None: + break + else: + if result is None: + result = scan(a.ravel(), op, dtype=dtype) + else: + scan(result, op, dtype=dtype, out=result) + else: + if result is None: + result = a.astype(dtype, order='C') + axis = internal._normalize_axis_index(axis, a.ndim) + result = _proc_as_batch(result, axis, op) + # This is for when the original out param was not contiguous + if out is not None and out.data != result.data: + elementwise_copy(result.reshape(out.shape), out) + else: + out = result + return out + + +# Only for test +def _scan_for_test(a, out=None): + return scan(a, scan_op.SCAN_SUM, dtype=None, out=out) + + +cpdef _ndarray_base _nansum(_ndarray_base a, axis, dtype, out, keepdims): + if cupy.iscomplexobj(a): + return _nansum_complex_dtype(a, axis, dtype, out, keepdims) + elif dtype is None: + return _nansum_auto_dtype(a, axis, dtype, out, keepdims) + else: + return _nansum_keep_dtype(a, axis, dtype, out, keepdims) + + +cpdef _ndarray_base _nanprod(_ndarray_base a, axis, dtype, out, keepdims): + if cupy.iscomplexobj(a): + return _nanprod_complex_dtype(a, axis, dtype, out, keepdims) + elif dtype is None: + return _nanprod_auto_dtype(a, axis, dtype, out, keepdims) + else: + return _nanprod_keep_dtype(a, axis, dtype, out, keepdims) + + +_sum_auto_dtype = create_reduction_func( + 'cupy_sum', + ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('in0', 'a + b', 'out0 = type_out0_raw(a)', None), 0) + + +_sum_keep_dtype = create_reduction_func( + 'cupy_sum_with_dtype', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('in0', 'a + b', 'out0 = type_out0_raw(a)', None), 0) + + +_nansum_auto_dtype = create_reduction_func( + 'cupy_nansum', + ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('(in0 == in0) ? 
in0 : type_in0_raw(0)', + 'a + b', 'out0 = type_out0_raw(a)', None), 0) + + +_nansum_keep_dtype = create_reduction_func( + 'cupy_nansum_with_dtype', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('(in0 == in0) ? in0 : type_in0_raw(0)', + 'a + b', 'out0 = type_out0_raw(a)', None), 0) + + +_nansum_complex_dtype = create_reduction_func( + 'cupy_nansum_complex_dtype', + ('F->F', 'D->D'), + (''' + type_in0_raw((in0.real() == in0.real()) ? in0.real() : 0, + (in0.imag() == in0.imag()) ? in0.imag() : 0) + ''', + 'a + b', 'out0 = type_out0_raw(a)', None), 0) + + +_prod_auto_dtype = create_reduction_func( + 'cupy_prod', + ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('in0', 'a * b', 'out0 = type_out0_raw(a)', None), 1) + + +_prod_keep_dtype = create_reduction_func( + 'cupy_prod_with_dtype', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('in0', 'a * b', 'out0 = type_out0_raw(a)', None), 1) + + +_nanprod_auto_dtype = create_reduction_func( + 'cupy_nanprod', + ('?->l', 'b->l', 'B->L', 'h->l', 'H->L', 'i->l', 'I->L', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('(in0 == in0) ? in0 : type_in0_raw(1)', + 'a * b', 'out0 = type_out0_raw(a)', None), 1) + + +_nanprod_keep_dtype = create_reduction_func( + 'cupy_nanprod_with_dtype', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('(in0 == in0) ? in0 : type_in0_raw(1)', + 'a * b', 'out0 = type_out0_raw(a)', None), 1) + + +_nanprod_complex_dtype = create_reduction_func( + 'cupy_nanprod_complex_dtype', + ('F->F', 'D->D'), + (''' + type_in0_raw((in0.real() == in0.real()) ? in0.real() : 1, + (in0.imag() == in0.imag()) ? in0.imag() : 1) + ''', + 'a * b', 'out0 = type_out0_raw(a)', None), 1) + +cdef create_arithmetic( + name, op, boolop, doc, cutensor_op=None, scatter_op=None): + # boolop is either + # - str (the operator for bool-bool inputs) or + # - callable (a function to raise an error for bool-bool inputs). + if isinstance(boolop, str): + boolop = 'out0 = in0 %s in1' % boolop + + return create_ufunc( + 'cupy_' + name, + (('??->?', boolop), + 'bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', + 'LL->L', 'qq->q', 'QQ->Q', 'ee->e', 'ff->f', 'dd->d', 'FF->F', + 'DD->D'), + 'out0 = in0 %s in1' % op, + doc=doc, + cutensor_op=cutensor_op, + scatter_op=scatter_op) + + +_add = create_arithmetic( + 'add', '+', '|', + '''Adds two arrays elementwise. + + .. seealso:: :data:`numpy.add` + + ''', + cutensor_op=('OP_ADD', 1, 1), scatter_op='add') + + +_conjugate = create_ufunc( + 'cupy_conjugate', + ('b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', 'q->q', + 'Q->Q', 'e->e', 'f->f', 'd->d', + ('F->F', 'out0 = conj(in0)'), + ('D->D', 'out0 = conj(in0)')), + 'out0 = in0', + doc='''Returns the complex conjugate, element-wise. + + .. seealso:: :data:`numpy.conjugate` + + ''') + + +_angle = create_ufunc( + 'cupy_angle', + ('?->d', 'e->e', 'f->f', 'd->d', + ('F->f', 'out0 = arg(in0)'), + ('D->d', 'out0 = arg(in0)')), + 'out0 = in0 >= 0 ? 0 : M_PI', + doc='''Returns the angle of the complex argument. 
+
+    .. seealso:: :func:`numpy.angle`
+
+    ''')
+
+
+_angle_deg = create_ufunc(
+    'cupy_angle_deg',
+    ('?->d', 'e->e', 'f->f', 'd->d',
+     ('F->f', 'out0 = arg(in0) * (180.0 / M_PI)'),
+     ('D->d', 'out0 = arg(in0) * (180.0 / M_PI)')),
+    'out0 = in0 >= 0 ? 0 : 180.0',
+    doc='''Returns the angle of the complex argument.
+
+    .. seealso:: :func:`numpy.angle`
+
+    ''')
+
+
+def _positive_boolean_error():
+    raise TypeError(
+        'The cupy boolean positive, the `+` operator, is not supported.')
+
+
+_positive = create_ufunc(
+    'cupy_positive',
+    (('?->?', _positive_boolean_error),
+     'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
+     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
+    'out0 = +in0',
+    doc='''Takes numerical positive elementwise.
+
+    .. seealso:: :data:`numpy.positive`
+
+    ''')
+
+
+def _negative_boolean_error():
+    raise TypeError(
+        'The cupy boolean negative, the `-` operator, is not supported, '
+        'use the `~` operator or the logical_not function instead.')
+
+
+_negative = create_ufunc(
+    'cupy_negative',
+    (('?->?', _negative_boolean_error),
+     'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
+     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
+    'out0 = -in0',
+    doc='''Takes numerical negative elementwise.
+
+    .. seealso:: :data:`numpy.negative`
+
+    ''')
+
+
+_multiply = create_arithmetic(
+    'multiply', '*', '&',
+    '''Multiplies two arrays elementwise.
+
+    .. seealso:: :data:`numpy.multiply`
+
+    ''',
+    cutensor_op=('OP_MUL', 1, 1))
+
+
+# `integral_power` should return somewhat appropriate values for negative
+# integral powers (for which NumPy would raise errors). Hence the branches in
+# the beginning. This behavior is not officially documented and could change.
+cdef _power_preamble = '''
+template <typename T>
+inline __device__ T integral_power(T in0, T in1) {
+    if (in1 < 0) {
+        if (in0 == -1) {return (in1 & 1) ? -1 : 1;}
+        else {return (in0 == 1) ? 1 : 0;}
+    }
+    T out0 = 1;
+    while (in1 > 0) {
+        if (in1 & 1) {
+            out0 *= in0;
+        }
+        in0 *= in0;
+        in1 >>= 1;
+    }
+    return out0;
+}
+
+template <typename T>
+inline __device__ T complex_power(T in0, T in1) {
+    return in1 == T(0) ? T(1): pow(in0, in1);
+}
+'''
+
+_power = create_ufunc(
+    'cupy_power',
+    ('??->b', 'bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l',
+     'LL->L', 'qq->q', 'QQ->Q',
+     ('ee->e', 'out0 = powf(in0, in1)'),
+     ('ff->f', 'out0 = powf(in0, in1)'),
+     ('dd->d', 'out0 = pow(in0, in1)'),
+     ('FF->F', 'out0 = complex_power(in0, in1)'),
+     ('DD->D', 'out0 = complex_power(in0, in1)')),
+    'out0 = integral_power(in0, in1)',
+    preamble=_power_preamble,
+    doc='''Computes ``x1 ** x2`` elementwise.
+
+    .. seealso:: :data:`numpy.power`
+
+    ''')
+
+
+def _subtract_boolean_error():
+    raise TypeError(
+        'cupy boolean subtract, the `-` operator, is deprecated, use the '
+        'bitwise_xor, the `^` operator, or the logical_xor function instead.')
+
+
+_subtract = create_arithmetic(
+    'subtract', '-', _subtract_boolean_error,
+    '''Subtracts arguments elementwise.
+
+    .. seealso:: :data:`numpy.subtract`
+
+    ''',
+    cutensor_op=('OP_ADD', 1, -1), scatter_op='sub')
+
+
+_true_divide = create_ufunc(
+    'cupy_true_divide',
+    ('bb->d', 'BB->d', 'hh->d', 'HH->d', 'ii->d', 'II->d', 'll->d', 'LL->d',
+     'qq->d', 'QQ->d', 'ee->e', 'ff->f', 'dd->d', 'FF->F', 'DD->D'),
+    'out0 = (out0_type)in0 / (out0_type)in1',
+    doc='''Elementwise true division (i.e. division as floating values).
+
+    .. 
seealso:: :data:`numpy.true_divide` + + ''', + out_ops=('ee->e', 'ff->f', 'dd->d', 'FF->F', 'DD->D'), +) + + +_divide = _true_divide + + +_floor_divide = create_ufunc( + 'cupy_floor_divide', + ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', 'LL->L', + 'qq->q', 'QQ->Q', 'ee->e', 'ff->f', 'dd->d'), + 'out0 = _floor_divide(in0, in1)', + doc='''Elementwise floor division (i.e. integer quotient). + + .. seealso:: :data:`numpy.floor_divide` + + ''') + + +_remainder = create_ufunc( + 'cupy_remainder', + ('bb->b', 'BB->B', 'hh->h', 'HH->H', 'ii->i', 'II->I', 'll->l', 'LL->L', + 'qq->q', 'QQ->Q', + ('ee->e', 'out0 = in0 - _floor_divide(in0, in1) * in1'), + ('ff->f', 'out0 = in0 - _floor_divide(in0, in1) * in1'), + ('dd->d', 'out0 = in0 - _floor_divide(in0, in1) * in1')), + 'out0 = (in0 - _floor_divide(in0, in1) * in1) * (in1 != 0)', + doc='''Computes the remainder of Python division elementwise. + + .. seealso:: :data:`numpy.remainder` + + ''') + + +_absolute = create_ufunc( + 'cupy_absolute', + (('?->?', 'out0 = in0'), + 'b->b', ('B->B', 'out0 = in0'), 'h->h', ('H->H', 'out0 = in0'), + 'i->i', ('I->I', 'out0 = in0'), 'l->l', ('L->L', 'out0 = in0'), + 'q->q', ('Q->Q', 'out0 = in0'), + ('e->e', 'out0 = fabsf(in0)'), + ('f->f', 'out0 = fabsf(in0)'), + ('d->d', 'out0 = fabs(in0)'), + ('F->f', 'out0 = abs(in0)'), + ('D->d', 'out0 = abs(in0)')), + 'out0 = in0 > 0 ? in0 : -in0', + doc='''Elementwise absolute value function. + + .. seealso:: :data:`numpy.absolute` + + ''') + + +_sqrt = create_ufunc( + 'cupy_sqrt', + ('e->e', 'f->f', 'd->d', 'F->F', 'D->D'), + 'out0 = sqrt(in0)', + doc='''Elementwise square root function. + + .. seealso:: :data:`numpy.sqrt` + + ''') + + +_clip = create_ufunc( + 'cupy_clip', + ('???->?', 'bbb->b', 'BBB->B', 'hhh->h', 'HHH->H', 'iii->i', 'III->I', + 'lll->l', 'LLL->L', 'qqq->q', 'QQQ->Q', 'eee->e', 'fff->f', 'ddd->d'), + 'out0 = in1 > in2 ? in2 : (in0 < in1 ? in1 : (in0 > in2 ? in2 : in0))') + + +# Variables to expose to Python +# (cythonized data cannot be exposed to Python, even with cpdef.) 
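+# [Illustrative note, not part of the upstream file] The `integral_power`
+# helper defined earlier in this file gives negative integer exponents a
+# defined result instead of the error NumPy raises; the source itself warns
+# this is undocumented behavior that could change. Tracing its branches:
+#
+#     >>> import cupy
+#     >>> int(cupy.power(cupy.array(2), -2))    # |base| > 1 truncates to 0
+#     0
+#     >>> int(cupy.power(cupy.array(-1), -3))   # (-1)**(odd negative)
+#     -1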
+ + +add = _add +conjugate = _conjugate +angle = _angle +angle_deg = _angle_deg +positive = _positive +negative = _negative +multiply = _multiply +divide = _divide +power = _power +subtract = _subtract +true_divide = _true_divide +floor_divide = _floor_divide +remainder = _remainder +absolute = _absolute +sqrt = _sqrt + +sum_auto_dtype = _sum_auto_dtype # used from cupy/math/sumprod.py +nansum_auto_dtype = _nansum_auto_dtype # used from cupy/math/sumprod.py +prod_auto_dtype = _prod_auto_dtype # used from cupy/math/sumprod.py +nanprod_auto_dtype = _nanprod_auto_dtype # used from cupy/math/sumprod.py +clip = _clip # used from cupy/math/misc.py diff --git a/cupy/_core/_routines_sorting.pxd b/cupy/_core/_routines_sorting.pxd new file mode 100644 index 0000000..a118987 --- /dev/null +++ b/cupy/_core/_routines_sorting.pxd @@ -0,0 +1,7 @@ +from cupy._core.core cimport _ndarray_base + + +cdef _ndarray_sort(_ndarray_base self, int axis) +cdef _ndarray_base _ndarray_argsort(_ndarray_base self, axis) +cdef _ndarray_partition(_ndarray_base self, kth, int axis) +cdef _ndarray_base _ndarray_argpartition(self, kth, axis) diff --git a/cupy/_core/_routines_sorting.pyx b/cupy/_core/_routines_sorting.pyx new file mode 100644 index 0000000..3d33942 --- /dev/null +++ b/cupy/_core/_routines_sorting.pyx @@ -0,0 +1,534 @@ +import string + +import numpy + +import cupy +from cupy._core._scalar import get_typename as _get_typename +from cupy._core._ufuncs import elementwise_copy +import cupy._core.core as core +from cupy import _util +from cupy.cuda import thrust + +from cupy._core cimport _routines_manipulation as _manipulation +from cupy._core.core cimport compile_with_cache +from cupy._core.core cimport _ndarray_base +from cupy._core cimport internal + + +cdef _ndarray_sort(_ndarray_base self, int axis): + cdef int ndim = self._shape.size() + cdef _ndarray_base data + + if not cupy.cuda.thrust.available: + raise RuntimeError('Thrust is needed to use cupy.sort. Please ' + 'install CUDA Toolkit with Thrust then ' + 'reinstall CuPy after uninstalling it.') + + if ndim == 0: + raise numpy.AxisError('Sorting arrays with the rank of zero is not ' + 'supported') # as numpy.sort() raises + + # TODO(takagi): Support sorting views + if not self._c_contiguous: + raise NotImplementedError('Sorting non-contiguous array is not ' + 'supported.') + + axis = internal._normalize_axis_index(axis, ndim) + + if axis == ndim - 1: + data = self + else: + data = _manipulation.rollaxis(self, axis, ndim).copy() + + if ndim == 1: + thrust.sort(self.dtype, data.data.ptr, 0, self.shape) + else: + max_size = max(min(1 << 22, data.size) // data.shape[-1], 1) + keys_array = core.ndarray( + (max_size * data.shape[-1],), dtype=numpy.intp) + stop = data.size // data.shape[-1] + for offset in range(0, stop, max_size): + width = min(max_size, stop - offset) + thrust.sort( + self.dtype, + data.data.ptr + offset * data.shape[-1] * data.itemsize, + keys_array.data.ptr, + (width, data.shape[-1]), + ) + + if axis == ndim - 1: + pass + else: + data = _manipulation.rollaxis(data, -1, axis) + elementwise_copy(data, self) + + +cdef _ndarray_base _ndarray_argsort(_ndarray_base self, axis): + cdef int _axis, ndim + cdef _ndarray_base data + + if not cupy.cuda.thrust.available: + raise RuntimeError('Thrust is needed to use cupy.argsort. 
Please ' + 'install CUDA Toolkit with Thrust then ' + 'reinstall CuPy after uninstalling it.') + + self = cupy.atleast_1d(self) + ndim = self._shape.size() + + if axis is None: + data = self.ravel() + _axis = -1 + else: + data = self + _axis = axis + + _axis = internal._normalize_axis_index(_axis, ndim) + + if _axis == ndim - 1: + data = data.copy() + else: + data = _manipulation.rollaxis(data, _axis, ndim).copy() + shape = data.shape + + idx_array = core.ndarray(shape, dtype=numpy.intp) + + if ndim == 1: + thrust.argsort(self.dtype, idx_array.data.ptr, data.data.ptr, 0, + shape) + else: + keys_array = core.ndarray(shape, dtype=numpy.intp) + thrust.argsort(self.dtype, idx_array.data.ptr, data.data.ptr, + keys_array.data.ptr, shape) + + if _axis == ndim - 1: + return idx_array + else: + return _manipulation.rollaxis(idx_array, -1, _axis) + + +cdef _ndarray_partition(_ndarray_base self, kth, int axis): + """Partitions an array. + + Args: + kth (int or sequence of ints): Element index to partition by. If + supplied with a sequence of k-th it will partition all elements + indexed by k-th of them into their sorted position at once. + + axis (int): Axis along which to sort. Default is -1, which means + sort along the last axis. + + .. seealso:: + :func:`cupy.partition` for full documentation, + :meth:`numpy.ndarray.partition` + + """ + + cdef int ndim = self._shape.size() + cdef Py_ssize_t k, max_k, length, s, sz, t + cdef _ndarray_base data + + if ndim == 0: + raise numpy.AxisError('Sorting arrays with the rank of zero is not ' + 'supported') + + if not self._c_contiguous: + raise NotImplementedError('Sorting non-contiguous array is not ' + 'supported.') + + axis = internal._normalize_axis_index(axis, ndim) + + if axis == ndim - 1: + data = self + else: + data = _manipulation.rollaxis(self, axis, ndim).copy() + + length = self._shape[axis] + if isinstance(kth, int): + kth = kth, + max_k = 0 + for k in kth: + if k < 0: + k += length + if not (0 <= k < length): + raise ValueError('kth(={}) out of bounds {}'.format(k, length)) + if max_k < k: + max_k = k + + # For simplicity, max_k is round up to the power of 2. If max_k is + # already the power of 2, it is round up to the next power of 2 because + # we need to collect the first max(kth)+1 elements. + max_k = max(32, 1 << max_k.bit_length()) + + # The parameter t is the length of the list that stores elements to be + # selected for each thread. We divide the array into sz subarrays. + # These parameters are determined from the measurement on TITAN X. + t = 4 + sz = 512 + while sz > 0 and length // sz < max_k + 32 * t: + sz //= 2 + sz *= self.size // length + + # If the array size is small or k is large, we simply sort the array. + if length < 32 or sz <= 32 or max_k >= 1024: + # kth is ignored. + data.sort(axis=-1) + else: + shape = data.shape + data = data.ravel() + + # For each subarray, we collect first k elements to the head. + kern, merge_kern = _partition_kernel(self.dtype) + block_size = 32 + grid_size = sz + kern(grid=(grid_size,), block=(block_size,), args=( + data, max_k, self.size, t, sz)) + + # Merge heads of subarrays. 
+ s = 1 + while s < sz // (self.size // length): + block_size = 32 + grid_size = sz // s // 2 + merge_kern(grid=(grid_size,), block=(block_size,), args=( + data, max_k, self.size, sz, s)) + s *= 2 + + data = data.reshape(shape) + + if axis != ndim - 1: + data = _manipulation.rollaxis(data, -1, axis) + elementwise_copy(data, self) + + +cdef _ndarray_base _ndarray_argpartition(self, kth, axis): + """Returns the indices that would partially sort an array. + + Args: + kth (int or sequence of ints): Element index to partition by. If + supplied with a sequence of k-th it will partition all elements + indexed by k-th of them into their sorted position at once. + axis (int or None): Axis along which to sort. Default is -1, which + means sort along the last axis. If None is supplied, the array + is flattened before sorting. + + Returns: + cupy.ndarray: Array of the same type and shape as ``a``. + + .. seealso:: + :func:`cupy.argpartition` for full documentation, + :meth:`numpy.ndarray.argpartition` + + """ + cdef int _axis, ndim + cdef Py_ssize_t k, max_k, length, s, sz, t + cdef _ndarray_base data + if axis is None: + data = self.ravel() + _axis = -1 + else: + data = self + _axis = axis + + ndim = data._shape.size() + _axis = internal._normalize_axis_index(_axis, ndim) + + if _axis != ndim - 1: + data = _manipulation.rollaxis(self, _axis, ndim).copy() + + length = data._shape[ndim - 1] + + if length == 0: + return cupy.empty((0,), dtype=cupy.int64) + + if isinstance(kth, int): + kth = kth, + max_k = 0 + for k in kth: + if k < 0: + k += length + if not (0 <= k < length): + raise ValueError('kth(={}) out of bounds {}'.format(k, length)) + if max_k < k: + max_k = k + + # For simplicity, max_k is round up to the power of 2. If max_k is + # already the power of 2, it is round up to the next power of 2 because + # we need to collect the first max(kth)+1 elements. + max_k = max(32, 1 << max_k.bit_length()) + + # The parameter t is the length of the list that stores elements to be + # selected for each thread. We divide the array into sz subarrays. + # These parameters are determined from the measurement on TITAN X. + t = 4 + sz = 512 + while sz > 0 and length // sz < max_k + 32 * t: + sz //= 2 + sz *= self.size // length + shape = data.shape + + # If the array size is small or k is large, we simply sort the array. + if length < 32 or sz < 1 or max_k >= 1024: + # kth is ignored. + indices = data.argsort(axis=-1) + else: + data = data.ravel() + indices = cupy.arange(0, data.shape[0], dtype=cupy.int64) + + # For each subarray, we collect first k elements to the head. + kern, merge_kern = _argpartition_kernel(self.dtype) + block_size = 32 + grid_size = sz + kern(grid=(grid_size,), block=(block_size,), args=( + data, indices, max_k, self.size, t, sz)) + + # Merge heads of subarrays. 
+        s = 1
+        while s < sz // (self.size // length):
+            block_size = 32
+            grid_size = sz // s // 2
+            merge_kern(grid=(grid_size,), block=(block_size,), args=(
+                data, indices, max_k, self.size, sz, s))
+            s *= 2
+
+        # Rearrange indices w.r.t the original axis
+        axis_indices = cupy.unravel_index(indices, shape)
+        indices = axis_indices[-1]
+        indices = indices.reshape(shape)
+
+    if _axis != ndim - 1:
+        indices = _manipulation.rollaxis(indices, -1, _axis)
+
+    return indices
+
+
+@_util.memoize(for_each_device=True)
+def _partition_kernel(dtype):
+    name = 'partition_kernel'
+    merge_kernel = 'partition_merge_kernel'
+    dtype = _get_typename(dtype)
+    source = string.Template('''
+    template<typename T>
+    __device__ void bitonic_sort_step(CArray<T, 1, true> a,
+            ptrdiff_t x, ptrdiff_t y, int i, ptrdiff_t s, ptrdiff_t w) {
+        for (ptrdiff_t j = i; j < (y - x) / 2; j += 32) {
+            ptrdiff_t n = j + (j & -w);
+            T v = a[n + x], u = a[n + w + x];
+            if (n & s ? v < u : v > u) {
+                a[n + x] = u;
+                a[n + w + x] = v;
+            }
+        }
+    }
+
+    // Sort a[x:y].
+    template<typename T>
+    __device__ void bitonic_sort(
+            CArray<T, 1, true> a, ptrdiff_t x, ptrdiff_t y, int i) {
+        for (ptrdiff_t s = 2; s <= y - x; s *= 2) {
+            for (ptrdiff_t w = s / 2; w >= 1; w /= 2) {
+                bitonic_sort_step< T >(a, x, y, i, s, w);
+            }
+        }
+    }
+
+    // Merge first k elements and the next 32 times t elements.
+    template<typename T>
+    __device__ void merge(
+            CArray<T, 1, true> a,
+            int k, int i, ptrdiff_t x, ptrdiff_t z, int u) {
+        for (int s = i; s < u; s += 32) {
+            if (a[x + k - s - 1] > a[z + s]) {
+                T tmp = a[x + k - s - 1];
+                a[x + k - s - 1] = a[z + s];
+                a[z + s] = tmp;
+            }
+        }
+
+        // After merge step, the first k elements are already bitonic.
+        // Therefore, we do not need to fully sort.
+        for (int w = k / 2; w >= 1; w /= 2) {
+            bitonic_sort_step< T >(a, x, k + x, i, k, w);
+        }
+    }
+
+    extern "C" {
+    // In this function, 32 threads handle one subarray. This number equals
+    // the warp size. The first k elements are always sorted, and the next
+    // 32 * t elements store values that may still be selected.
+    __global__ void ${name}(
+            CArray<${dtype}, 1, true> a,
+            int k, ptrdiff_t n, int t, ptrdiff_t sz) {
+
+        // This thread handles a[z:m].
+        ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
+            + threadIdx.x;
+        ptrdiff_t z = i / 32 * n / sz;
+        ptrdiff_t m = (i / 32 + 1) * n / sz;
+        int id = i % 32;
+        int x = 0;
+
+        bitonic_sort< ${dtype} >(a, z, k + z, id);
+        ptrdiff_t j;
+        for (j = k + id + z; j < m - (m - z) % 32; j += 32) {
+            if (a[j] < a[k - 1 + z]) {
+                ${dtype} tmp = a[k + 32 * x + id + z];
+                a[k + 32 * x + id + z] = a[j];
+                a[j] = tmp;
+                ++x;
+            }
+
+            // If at least one thread in the warp has found t values that
+            // can be selected, we update the first k elements.
+    #if __CUDACC_VER_MAJOR__ >= 9
+            if (__any_sync(0xffffffff, x >= t)) {
+    #else
+            if (__any(x >= t)) {
+    #endif
+                bitonic_sort< ${dtype} >(a, k + z, 32 * t + k + z, id);
+                merge< ${dtype} >(a, k, id, z, k + z, min(k, 32 * t));
+                x = 0;
+            }
+        }
+        if (j < m && a[j] < a[k - 1 + z]) {
+            ${dtype} tmp = a[k + 32 * x + id + z];
+            a[k + 32 * x + id + z] = a[j];
+            a[j] = tmp;
+        }
+
+        // Finally, we merge the first k elements and the remainders to be
+        // stored.
+        bitonic_sort< ${dtype} >(a, k + z, 32 * t + k + z, id);
+        merge< ${dtype} >(a, k, id, z, k + z, min(k, 32 * t));
+    }
+
+    __global__ void ${merge_kernel}(
+            CArray<${dtype}, 1, true> a, int k, ptrdiff_t n, int sz, int s) {
+        ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
+            + threadIdx.x;
+        ptrdiff_t z = i / 32 * 2 * s * n / sz;
+        ptrdiff_t m = (i / 32 * 2 + 1) * s * n / sz;
+        int id = i % 32;
+        merge< ${dtype} >(a, k, id, z, m, k);
+    }
+    }
+    ''').substitute(name=name, merge_kernel=merge_kernel, dtype=dtype)
+    module = compile_with_cache(source)
+    return module.get_function(name), module.get_function(merge_kernel)
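+# [Illustrative note, not part of the upstream file] The selection scheme
+# above only keeps the smallest max(kth)+1 candidates of each row ordered,
+# which is all the partition contract requires: after the call, the element
+# at each requested kth position is in its sorted place.
+#
+#     >>> import cupy
+#     >>> a = cupy.array([7, 0, 5, 3, 9, 1])
+#     >>> a.partition(2)          # in-place; sorted order is [0,1,3,5,7,9]
+#     >>> int(a[2])               # a[2] is now the 3rd smallest value
+#     3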
+
+
+@_util.memoize(for_each_device=True)
+def _argpartition_kernel(dtype):
+    name = 'argpartition_kernel'
+    merge_kernel = 'argpartition_merge_kernel'
+    dtype = _get_typename(dtype)
+    source = string.Template('''
+    template<typename T>
+    __device__ void bitonic_sort_step(
+            CArray<T, 1, true> a, CArray<long long, 1, true> b,
+            ptrdiff_t x, ptrdiff_t y, int i, ptrdiff_t s, ptrdiff_t w) {
+        for (ptrdiff_t j = i; j < (y - x) / 2; j += 32) {
+            ptrdiff_t n = j + (j & -w);
+            T v = a[b[n + x]], u = a[b[n + w + x]];
+            if (n & s ? v < u : v > u) {
+                long long temp = b[n + x];
+                b[n + x] = b[n + w + x];
+                b[n + w + x] = temp;
+            }
+        }
+    }
+
+    // Sort a[x:y].
+    template<typename T>
+    __device__ void bitonic_sort(
+            CArray<T, 1, true> a, CArray<long long, 1, true> b,
+            ptrdiff_t x, ptrdiff_t y, int i) {
+        for (ptrdiff_t s = 2; s <= y - x; s *= 2) {
+            for (ptrdiff_t w = s / 2; w >= 1; w /= 2) {
+                bitonic_sort_step< T >(a, b, x, y, i, s, w);
+            }
+        }
+    }
+
+    // Merge first k elements and the next 32 times t elements.
+    template<typename T>
+    __device__ void merge(
+            CArray<T, 1, true> a, CArray<long long, 1, true> b,
+            int k, int i, ptrdiff_t x, ptrdiff_t z, int u) {
+        for (int s = i; s < u; s += 32) {
+            if (a[b[x + k - s - 1]] > a[b[z + s]]) {
+                long long tmp = b[x + k - s - 1];
+                b[x + k - s - 1] = b[z + s];
+                b[z + s] = tmp;
+            }
+        }
+
+        // After merge step, the first k elements are already bitonic.
+        // Therefore, we do not need to fully sort.
+        for (int w = k / 2; w >= 1; w /= 2) {
+            bitonic_sort_step< T >(a, b, x, k + x, i, k, w);
+        }
+    }
+
+    extern "C" {
+    // In this function, 32 threads handle one subarray. This number equals
+    // the warp size. The first k elements are always sorted, and the next
+    // 32 * t elements store values that may still be selected.
+    __global__ void ${name}(
+            CArray<${dtype}, 1, true> a, CArray<long long, 1, true> b,
+            int k, ptrdiff_t n, int t, ptrdiff_t sz) {
+
+        // This thread handles a[z:m].
+        ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
+            + threadIdx.x;
+        ptrdiff_t z = i / 32 * n / sz;
+        ptrdiff_t m = (i / 32 + 1) * n / sz;
+        int id = i % 32;
+        int x = 0;
+
+        bitonic_sort< ${dtype} >(a, b, z, k + z, id);
+        ptrdiff_t j;
+        for (j = k + id + z; j < m - (m - z) % 32; j += 32) {
+            if (a[b[j]] < a[b[k - 1 + z]]) {
+                long long tmp = b[k + 32 * x + id + z];
+                b[k + 32 * x + id + z] = b[j];
+                b[j] = tmp;
+                ++x;
+            }
+
+            // If at least one thread in the warp has found t values that
+            // can be selected, we update the first k elements.
+    #if __CUDACC_VER_MAJOR__ >= 9
+            if (__any_sync(0xffffffff, x >= t)) {
+    #else
+            if (__any(x >= t)) {
+    #endif
+                bitonic_sort< ${dtype} >(a, b, k + z, 32 * t + k + z, id);
+                merge< ${dtype} >(a, b, k, id, z, k + z, min(k, 32 * t));
+                x = 0;
+            }
+        }
+        if (j < m && a[b[j]] < a[b[k - 1 + z]]) {
+            long long tmp = b[k + 32 * x + id + z];
+            b[k + 32 * x + id + z] = b[j];
+            b[j] = tmp;
+        }
+
+        // Finally, we merge the first k elements and the remainders to be
+        // stored.
+        bitonic_sort< ${dtype} >(a, b, k + z, 32 * t + k + z, id);
+        merge< ${dtype} >(a, b, k, id, z, k + z, min(k, 32 * t));
+    }
+
+    __global__ void ${merge_kernel}(
+            CArray<${dtype}, 1, true> a, CArray<long long, 1, true> b,
+            int k, ptrdiff_t n, int sz, int s) {
+        ptrdiff_t i = static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x
+            + threadIdx.x;
+        ptrdiff_t z = i / 32 * 2 * s * n / sz;
+        ptrdiff_t m = (i / 32 * 2 + 1) * s * n / sz;
+        int id = i % 32;
+        merge< ${dtype} >(a, b, k, id, z, m, k);
+    }
+    }
+    ''').substitute(name=name, merge_kernel=merge_kernel, dtype=dtype)
+    module = compile_with_cache(source)
+    return module.get_function(name), module.get_function(merge_kernel)
diff --git a/cupy/_core/_routines_statistics.pxd b/cupy/_core/_routines_statistics.pxd
new file mode 100644
index 0000000..1496961
--- /dev/null
+++ b/cupy/_core/_routines_statistics.pxd
@@ -0,0 +1,30 @@
+from cupy._core.core cimport _ndarray_base
+
+
+# TODO(niboshi): Move {nan,}arg{min,max} to sorting
+
+
+cdef _ndarray_base _ndarray_max(_ndarray_base self, axis, out, dtype, keepdims)
+cdef _ndarray_base _ndarray_min(_ndarray_base self, axis, out, dtype, keepdims)
+cdef _ndarray_base _ndarray_ptp(_ndarray_base self, axis, out, keepdims)
+cdef _ndarray_base _ndarray_argmax(
+    _ndarray_base self, axis, out, dtype, keepdims)
+cdef _ndarray_base _ndarray_argmin(
+    _ndarray_base self, axis, out, dtype, keepdims)
+cdef _ndarray_base _ndarray_mean(
+    _ndarray_base self, axis, dtype, out, keepdims)
+cdef _ndarray_base _ndarray_var(
+    _ndarray_base self, axis, dtype, out, ddof, keepdims)
+cdef _ndarray_base _ndarray_std(
+    _ndarray_base self, axis, dtype, out, ddof, keepdims)
+
+cpdef _ndarray_base _median(
+    _ndarray_base a, axis, out, overwrite_input, keepdims)
+
+cpdef _ndarray_base _nanmean(_ndarray_base a, axis, dtype, out, keepdims)
+cpdef _ndarray_base _nanvar(_ndarray_base a, axis, dtype, out, ddof, keepdims)
+cpdef _ndarray_base _nanstd(_ndarray_base a, axis, dtype, out, ddof, keepdims)
+
+
+cpdef _ndarray_base _nanargmin(_ndarray_base a, axis, out, dtype, keepdims)
+cpdef _ndarray_base _nanargmax(_ndarray_base a, axis, out, dtype, keepdims)
diff --git a/cupy/_core/_routines_statistics.pyx b/cupy/_core/_routines_statistics.pyx
new file mode 100644
index 0000000..f0f74f9
--- /dev/null
+++ b/cupy/_core/_routines_statistics.pyx
@@ -0,0 +1,763 @@
+from cpython cimport sequence
+
+import numpy
+from numpy import nan
+
+import cupy
+from cupy._core import _reduction
+from cupy._core._reduction import create_reduction_func
+from cupy._core._reduction import ReductionKernel
+from cupy._core._kernel import ElementwiseKernel
+from cupy._core._ufuncs import elementwise_copy
+
+from cupy._core cimport _accelerator
+from cupy._core cimport _routines_math as _math
+from cupy._core.core cimport _ndarray_base
+
+from cupy.cuda import cub
+
+try:
+    import cupy_backends.cuda.libs.cutensor as cuda_cutensor
+except ImportError:
+    cuda_cutensor = None
+
+
+cdef _ndarray_base _ndarray_max(
+        _ndarray_base self, axis, out, dtype, keepdims):
+    for accelerator in _accelerator._routine_accelerators:
+        result = None
+        if accelerator == _accelerator.ACCELERATOR_CUB:
+            # result will be None if the reduction is not compatible with CUB
+            result = cub.cub_reduction(
+                self, cub.CUPY_CUB_MAX, axis, dtype, out, keepdims)
+        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
+                cuda_cutensor is not None):
+            from cupyx import cutensor
+            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
+                # Complex dtype is not supported
+                continue
+            result = cutensor._try_reduction_routine(
+                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
+        if result is not None:
+            return result
+    return _amax(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
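+# [Illustrative note, not part of the upstream file] The reduction entry
+# points in this file share the dispatch pattern visible in `_ndarray_max`
+# above: each enabled accelerator (CUB, cuTENSOR) is tried in turn and
+# returns None when it cannot handle the request, after which the generic
+# reduction kernel is used. Which accelerators are tried is user-selectable
+# through the CUPY_ACCELERATORS environment variable, read at import time:
+#
+#     >>> import os
+#     >>> os.environ['CUPY_ACCELERATORS'] = 'cub'   # set before importing
+#     >>> import cupy
+#     >>> float(cupy.arange(6.).reshape(2, 3).max(axis=1)[1])
+#     5.0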
+
+
+cdef _ndarray_base _ndarray_min(
+        _ndarray_base self, axis, out, dtype, keepdims):
+    for accelerator in _accelerator._routine_accelerators:
+        result = None
+        if accelerator == _accelerator.ACCELERATOR_CUB:
+            # result will be None if the reduction is not compatible with CUB
+            result = cub.cub_reduction(
+                self, cub.CUPY_CUB_MIN, axis, dtype, out, keepdims)
+        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
+                cuda_cutensor is not None):
+            from cupyx import cutensor
+            if self.dtype.kind == 'c' or dtype in ('F', 'D'):
+                # Complex dtype is not supported
+                continue
+            result = cutensor._try_reduction_routine(
+                self, axis, dtype, out, keepdims, cuda_cutensor.OP_MIN, 1, 0)
+        if result is not None:
+            return result
+    return _amin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
+
+
+cdef _ndarray_base _ndarray_ptp(_ndarray_base self, axis, out, keepdims):
+    for accelerator in _accelerator._routine_accelerators:
+        if accelerator == _accelerator.ACCELERATOR_CUB:
+            # result will be None if the reduction is not compatible with CUB
+            result = cub.cub_reduction(
+                self, cub.CUPY_CUB_MAX, axis, None, out, keepdims)
+            if result is not None:
+                result -= cub.cub_reduction(
+                    self, cub.CUPY_CUB_MIN, axis, None, None, keepdims)
+                return result
+        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
+                cuda_cutensor is not None):
+            from cupyx import cutensor
+            if self.dtype.kind == 'c':
+                # Complex dtype is not supported
+                continue
+            maxv = cutensor._try_reduction_routine(
+                self, axis, None, out, keepdims, cuda_cutensor.OP_MAX, 1, 0)
+            if maxv is None:
+                continue
+            return cutensor._try_reduction_routine(
+                self, axis, None, maxv, keepdims, cuda_cutensor.OP_MIN, -1, 1)
+
+    result = _amax(self, axis=axis, out=out, keepdims=keepdims)
+    result -= _amin(self, axis=axis, out=None, keepdims=keepdims)
+    return result
+
+
+# TODO(leofang): this signature is incompatible with NumPy!
+cdef _ndarray_base _ndarray_argmax(
+        _ndarray_base self, axis, out, dtype, keepdims):
+    for accelerator in _accelerator._routine_accelerators:
+        if accelerator == _accelerator.ACCELERATOR_CUB:
+            # result will be None if the reduction is not compatible with CUB
+            if self._f_contiguous and self.dtype == numpy.bool_:
+                # temporary workaround casting the inputs to int8
+                # CUB argmax seems to return different values to
+                # NumPy for F-order bool array inputs
+                self = self.astype(numpy.int8)
+            result = cub.cub_reduction(
+                self, cub.CUPY_CUB_ARGMAX, axis, dtype, out, keepdims)
+            if result is not None:
+                return result
+    return _argmax(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
+
+
+# TODO(leofang): this signature is incompatible with NumPy!
+cdef _ndarray_base _ndarray_argmin(
+        _ndarray_base self, axis, out, dtype, keepdims):
+    for accelerator in _accelerator._routine_accelerators:
+        if accelerator == _accelerator.ACCELERATOR_CUB:
+            # result will be None if the reduction is not compatible with CUB
+            result = cub.cub_reduction(
+                self, cub.CUPY_CUB_ARGMIN, axis, dtype, out, keepdims)
+            if result is not None:
+                return result
+    return _argmin(self, axis=axis, out=out, dtype=dtype, keepdims=keepdims)
+
+
+cdef _ndarray_base _ndarray_mean(
+        _ndarray_base self, axis, dtype, out, keepdims):
+    cdef Py_ssize_t n
+
+    dtype_sum = dtype_out = dtype
+    if dtype is None:
+        if self.dtype.kind in 'iub':
+            dtype_out = numpy.float64
+            dtype_sum = numpy.float64
+        elif self.dtype.char == 'e':
+            dtype_sum = numpy.float32
+            dtype_out = numpy.float16
+    elif numpy.dtype(dtype).kind in 'iub':
+        # output will be the requested type, but compute the mean using float
+        dtype_out = dtype
+        dtype_sum = numpy.float64
+
+    for accelerator in _accelerator._routine_accelerators:
+        if accelerator == _accelerator.ACCELERATOR_CUB and self.size != 0:
+            result = cub.cub_reduction(
+                self, cub.CUPY_CUB_SUM, axis, dtype_sum, out, keepdims)
+            if result is not None:
+                n = self.size // result.size
+                cupy.true_divide(result, n, out=result, casting='unsafe')
+                break
+        if (accelerator == _accelerator.ACCELERATOR_CUTENSOR and
+                cuda_cutensor is not None):
+            from cupyx import cutensor
+            reduce_axis, _ = _reduction._get_axis(axis, self._shape.size())
+            n = 1
+            for i in reduce_axis:
+                n *= self._shape[i]
+            n = max(n, 1)
+            result = cutensor._try_reduction_routine(
+                self, axis, dtype_sum, out, keepdims,
+                cuda_cutensor.OP_ADD, 1.0 / n, 0)
+            if result is not None:
+                break
+    else:
+        result = _mean(
+            self, axis=axis, dtype=dtype_sum, out=out, keepdims=keepdims)
+
+    if dtype_out is not None and out is None:
+        result = result.astype(dtype_out)
+    return result
+
+
+cdef _ndarray_base _ndarray_var(
+        _ndarray_base self, axis, dtype, out, ddof, keepdims):
+    return _var(
+        self, axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims)
+
+
+cdef _ndarray_base _ndarray_std(
+        _ndarray_base self, axis, dtype, out, ddof, keepdims):
+    return _std(
+        self, axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims)
+
+
+cdef _min_max_preamble = '''
+template <typename T>
+struct min_max_st{
+    T value;
+    int index;
+    __device__ min_max_st() : index(-1) { }
+    __device__ min_max_st(T v) : value(v), index(0) { }
+    __device__ min_max_st(T v, int i) : value(v), index(i) { }
+};
+
+template <typename T>
+__device__ min_max_st<T> my_min(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    return min_max_st<T>(min(a.value, b.value));
+}
+template <typename T>
+__device__ min_max_st<T> my_min_float(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    if (isnan(a.value)) return a;
+    if (isnan(b.value)) return b;
+    return min_max_st<T>(min(a.value, b.value));
+}
+
+template <typename T>
+__device__ min_max_st<T> my_max(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    return min_max_st<T>(max(a.value, b.value));
+}
+template <typename T>
+__device__ min_max_st<T> my_max_float(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    if (isnan(a.value)) return a;
+    if (isnan(b.value)) return b;
+    return min_max_st<T>(max(a.value, b.value));
+}
+
+template <typename T>
+__device__ min_max_st<T> my_argmin(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    if (a.value == b.value)
+        return min_max_st<T>(a.value, min(a.index, b.index));
+    return (a.value <= b.value) ? a : b;
+}
+template <typename T>
+__device__ min_max_st<T> my_argmin_float(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    if (a.value == b.value)
+        return min_max_st<T>(a.value, min(a.index, b.index));
+    if (isnan(a.value)) return a;
+    if (isnan(b.value)) return b;
+    return (a.value <= b.value) ? a : b;
+}
+
+template <typename T>
+__device__ min_max_st<T> my_argmax(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    if (a.value == b.value)
+        return min_max_st<T>(a.value, min(a.index, b.index));
+    return (a.value >= b.value) ? a : b;
+}
+template <typename T>
+__device__ min_max_st<T> my_argmax_float(
+        const min_max_st<T>& a, const min_max_st<T>& b) {
+    if (a.index == -1) return b;
+    if (b.index == -1) return a;
+    if (a.value == b.value)
+        return min_max_st<T>(a.value, min(a.index, b.index));
+    if (isnan(a.value)) return a;
+    if (isnan(b.value)) return b;
+    return (a.value >= b.value) ? a : b;
+}
+
+'''
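+# [Illustrative note, not part of the upstream file] The reductions below
+# fold `min_max_st` pairs; `index == -1` marks an empty (identity) operand,
+# and argmin/argmax break value ties toward the smaller index. A small
+# Python model of `my_argmin` over (value, index) tuples:
+#
+#     >>> def my_argmin(a, b):
+#     ...     if a[1] == -1: return b          # a is the identity
+#     ...     if b[1] == -1: return a          # b is the identity
+#     ...     if a[0] == b[0]:                 # tie: keep smaller index
+#     ...         return (a[0], min(a[1], b[1]))
+#     ...     return a if a[0] <= b[0] else b
+#     >>> my_argmin((3, 0), (3, 2))
+#     (3, 0)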
return b; + if (b.index == -1) return a; + if (a.value == b.value) + return min_max_st(a.value, min(a.index, b.index)); + return (a.value <= b.value) ? a : b; +} +template +__device__ min_max_st my_argmin_float( + const min_max_st& a, const min_max_st& b) { + if (a.index == -1) return b; + if (b.index == -1) return a; + if (a.value == b.value) + return min_max_st(a.value, min(a.index, b.index)); + if (isnan(a.value)) return a; + if (isnan(b.value)) return b; + return (a.value <= b.value) ? a : b; +} + +template +__device__ min_max_st my_argmax( + const min_max_st& a, const min_max_st& b) { + if (a.index == -1) return b; + if (b.index == -1) return a; + if (a.value == b.value) + return min_max_st(a.value, min(a.index, b.index)); + return (a.value >= b.value) ? a : b; +} +template +__device__ min_max_st my_argmax_float( + const min_max_st& a, const min_max_st& b) { + if (a.index == -1) return b; + if (b.index == -1) return a; + if (a.value == b.value) + return min_max_st(a.value, min(a.index, b.index)); + if (isnan(a.value)) return a; + if (isnan(b.value)) return b; + return (a.value >= b.value) ? a : b; +} + +''' + + +cdef _amin = create_reduction_func( + 'cupy_min', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, 'my_min_float(a, b)', None, None)), + ('f->f', (None, 'my_min_float(a, b)', None, None)), + ('d->d', (None, 'my_min_float(a, b)', None, None)), + ('F->F', (None, 'my_min_float(a, b)', None, None)), + ('D->D', (None, 'my_min_float(a, b)', None, None))), + ('min_max_st(in0)', 'my_min(a, b)', 'out0 = a.value', + 'min_max_st'), + None, _min_max_preamble) + + +cdef _amax = create_reduction_func( + 'cupy_max', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', + ('e->e', (None, 'my_max_float(a, b)', None, None)), + ('f->f', (None, 'my_max_float(a, b)', None, None)), + ('d->d', (None, 'my_max_float(a, b)', None, None)), + ('F->F', (None, 'my_max_float(a, b)', None, None)), + ('D->D', (None, 'my_max_float(a, b)', None, None)), + ), + ('min_max_st(in0)', 'my_max(a, b)', 'out0 = a.value', + 'min_max_st'), + None, _min_max_preamble) + + +nanmin = create_reduction_func( + 'cupy_nanmin', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'), + ('min_max_st(in0)', 'my_min(a, b)', 'out0 = a.value', + 'min_max_st'), + None, _min_max_preamble) + + +nanmax = create_reduction_func( + 'cupy_nanmax', + ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L', + 'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'), + ('min_max_st(in0)', 'my_max(a, b)', 'out0 = a.value', + 'min_max_st'), + None, _min_max_preamble) + + +cdef _argmin = create_reduction_func( + 'cupy_argmin', + tuple(['{}->{}'.format(d, r) for r in 'qlihb' for d in '?BhHiIlLqQ']) + + ( + ('e->q', (None, 'my_argmin_float(a, b)', None, None)), + ('f->q', (None, 'my_argmin_float(a, b)', None, None)), + ('d->q', (None, 'my_argmin_float(a, b)', None, None)), + ('F->q', (None, 'my_argmin_float(a, b)', None, None)), + ('D->q', (None, 'my_argmin_float(a, b)', None, None))), + ('min_max_st(in0, _J)', 'my_argmin(a, b)', 'out0 = a.index', + 'min_max_st'), + None, _min_max_preamble, sort_reduce_axis=False) + + +cdef _argmax = create_reduction_func( + 'cupy_argmax', + tuple(['{}->{}'.format(d, r) for r in 'qlihb' for d in '?BhHiIlLqQ']) + + ( + ('e->q', (None, 'my_argmax_float(a, b)', None, None)), + ('f->q', (None, 'my_argmax_float(a, b)', None, None)), 
+ ('d->q', (None, 'my_argmax_float(a, b)', None, None)), + ('F->q', (None, 'my_argmax_float(a, b)', None, None)), + ('D->q', (None, 'my_argmax_float(a, b)', None, None))), + ('min_max_st(in0, _J)', 'my_argmax(a, b)', 'out0 = a.index', + 'min_max_st'), + None, _min_max_preamble, sort_reduce_axis=False) + + +cpdef _ndarray_base _nanargmax(_ndarray_base a, axis, out, dtype, keepdims): + return _nanargmax_func( + a, axis=axis, out=out, dtype=dtype, keepdims=keepdims) + + +cpdef _ndarray_base _nanargmin(_ndarray_base a, axis, out, dtype, keepdims): + return _nanargmin_func( + a, axis=axis, out=out, dtype=dtype, keepdims=keepdims) + + +cdef _nanargmin_func = create_reduction_func( + 'cupy_nanargmin', + ('?->q', 'B->q', 'h->q', 'H->q', 'i->q', 'I->q', 'l->q', 'L->q', + 'q->q', 'Q->q', + ('e->q', (None, 'my_argmin_float(a, b)', None, None)), + ('f->q', (None, 'my_argmin_float(a, b)', None, None)), + ('d->q', (None, 'my_argmin_float(a, b)', None, None)), + ('F->q', (None, 'my_argmin_float(a, b)', None, None)), + ('D->q', (None, 'my_argmin_float(a, b)', None, None))), + ('min_max_st(in0, isnan(in0) ? -1 : _J)', + 'my_argmin(a, b)', 'out0 = a.index', 'min_max_st'), + None, _min_max_preamble, sort_reduce_axis=False) + + +cdef _nanargmax_func = create_reduction_func( + 'cupy_nanargmax', + ('?->q', 'B->q', 'h->q', 'H->q', 'i->q', 'I->q', 'l->q', 'L->q', + 'q->q', 'Q->q', + ('e->q', (None, 'my_argmax_float(a, b)', None, None)), + ('f->q', (None, 'my_argmax_float(a, b)', None, None)), + ('d->q', (None, 'my_argmax_float(a, b)', None, None)), + ('F->q', (None, 'my_argmax_float(a, b)', None, None)), + ('D->q', (None, 'my_argmax_float(a, b)', None, None))), + ('min_max_st(in0, isnan(in0) ? -1 : _J)', + 'my_argmax(a, b)', 'out0 = a.index', 'min_max_st'), + None, _min_max_preamble, sort_reduce_axis=False) + + +cdef _exists_nan = ReductionKernel( + 'T x', 'bool y', 'isnan(x)', 'a || b', 'y = a', 'false', '_exists_nan') + + +cpdef _ndarray_base _median( + _ndarray_base a, axis, out, overwrite_input, keepdims): + + keep_ndim = a.ndim + + out_shape = None + if sequence.PySequence_Check(axis): + # cupy.sort and cupy.partition only support integer axis, so move + # all reduced dimensions to the end and reshape them into a single + # reduction axis. 
+ reduce_axis, out_axis = _reduction._get_axis(axis, keep_ndim) + out_shape = _reduction._get_out_shape(a.shape, reduce_axis, out_axis, + keepdims) + a = a.transpose(out_axis + reduce_axis) + sort_shape = tuple([a.shape[n] for n in range(len(out_axis))]) + (-1,) + a = a.reshape(sort_shape) + if not a.flags.c_contiguous: + a = cupy.ascontiguousarray(a) + axis = -1 + + if axis is None: + sz = a.size + else: + if axis < -keep_ndim or axis >= keep_ndim: + raise numpy.AxisError('Axis overrun') + sz = a.shape[axis] + if sz % 2 == 0: + szh = sz // 2 + kth = [szh - 1, szh] + else: + kth = [(sz - 1) // 2] + + if overwrite_input: + part = a + else: + part = a.copy() + + if axis is None: + part = part.ravel() + part.partition(kth) + else: + part.partition(kth, axis=axis) + + if part.shape == (): + return part + if axis is None: + axis = 0 + + indexer = [slice(None)] * part.ndim + + if keepdims and out_shape is None: + _indexer = [None] * (keep_ndim - part.ndim) + indexer.extend(_indexer) + + index = part.shape[axis] // 2 + if part.shape[axis] % 2 == 1: + indexer[axis] = slice(index, index+1) + else: + indexer[axis] = slice(index-1, index+1) + indexer = tuple(indexer) + + out = _mean( + part[indexer], axis=axis, dtype=None, out=out, keepdims=keepdims) + if part.dtype.kind in 'fc': + isnan = _exists_nan(part, axis=axis, keepdims=keepdims) + out = cupy.where(isnan, numpy.nan, out) + if out_shape is not None: + out = out.reshape(out_shape) + return out + + +cpdef _ndarray_base _nanmedian( + _ndarray_base a, axis, out, overwrite_input, keepdims): + + if axis is None: + axis = tuple(range(a.ndim)) + if not sequence.PySequence_Check(axis): + axis = (axis,) + + reduce_axis = [] + reduce_shape = [] + out_axis = [] + out_shape = [] + for i in range(a.ndim): + if axis is None or i in axis or i - a.ndim in axis: + reduce_axis.append(i) + reduce_shape.append(a.shape[i]) + else: + out_axis.append(i) + out_shape.append(a.shape[i]) + + a_data_ptr = a.data.ptr + a = a.transpose(out_axis + reduce_axis) + a = a.reshape(out_shape + [-1, ]) + a = cupy.ascontiguousarray(a) + + n_reduce = numpy.prod(reduce_shape) + n_reduce_each = cupy.full(out_shape, n_reduce, dtype='int32') + if a_data_ptr == a.data.ptr and overwrite_input is False: + a = a.copy() + _replace_nan_kernel(n_reduce, numpy.finfo(a.dtype).max, a, n_reduce_each) + a = cupy.sort(a, axis=-1) + + b = cupy.full(out_shape, cupy.nan, dtype=a.dtype) + _pickup_median_kernel(n_reduce, n_reduce_each, a, b) + + if keepdims: + b = b.reshape(out_shape + [1, ] * len(reduce_axis)) + axes = [-1, ] * b.ndim + for i, j in enumerate(out_axis + reduce_axis): + axes[j] = i + b = b.transpose(axes) + + if out is None: + out = b + else: + elementwise_copy(b, out) + return out + + +cdef _replace_nan_kernel = ElementwiseKernel( + 'I n_reduce, T val', 'T a, raw I n_reduce_each', + ''' + if (a != a) { + a = val; + atomicAdd(&(n_reduce_each[i / n_reduce]), -1); + } + ''', + 'cupy_replace_nan' +) + +cdef _pickup_median_kernel = ElementwiseKernel( + 'I n_reduce, I n_reduce_each, raw T a', 'T b', + ''' + if (n_reduce_each > 0) { + int l = (n_reduce_each - 1) / 2; + int h = (n_reduce_each ) / 2; + if (l == h) { + b = a[l + n_reduce * i]; + } else { + b = (a[l + n_reduce * i] + a[h + n_reduce * i]) + / static_cast(2.0); + } + } + ''', + 'cupy_pickup_median' +) + + +cdef _ndarray_base _mean( + _ndarray_base a, axis=None, dtype=None, out=None, keepdims=False): + if a.size == 0: + # Return nan; see also https://github.com/numpy/numpy/issues/13582 + return _mean_core_empty(a, axis, dtype, out, 
keepdims) + return _mean_core(a, axis, dtype, out, keepdims) + +cdef _ndarray_base _var( + _ndarray_base a, axis=None, dtype=None, out=None, ddof=0, + keepdims=False): + + if axis is None: + axis = tuple(range(a.ndim)) + if not isinstance(axis, tuple): + axis = (axis,) + + dtype_mean = a.dtype + dtype_out = numpy.dtype(dtype) + if dtype is None: + if a.dtype.kind in 'biu': + dtype_mean = 'float64' + dtype_out = 'float64' + else: + dtype_mean = a.dtype + dtype_out = a.dtype + if a.dtype.kind == 'c': + dtype_out = numpy.dtype(a.dtype.char.lower()) + + shape = a.shape + cdef Py_ssize_t items = 1 + for ax in axis: + items *= shape[ax] + + # Make alpha NaN when array is empty, mimics NumPy behavior, resulting in + # NaN. See https://github.com/numpy/numpy/issues/13582 for an explanation + # on why NaN is the result. + div = max(items - ddof, 0) + alpha = 1. / div if div != 0 else nan + + arrmean = a.mean(axis=axis, dtype=dtype_mean, out=None, keepdims=True) + + if out is None: + if dtype_out == 'float16': + var_core = _var_core_float16 + elif dtype_out == 'float32': + var_core = _var_core_float32 + else: + var_core = _var_core_float64 + return var_core(a, arrmean, alpha, axis=axis, keepdims=keepdims) + + out = _var_core_out(a, arrmean, alpha, out, axis=axis, keepdims=keepdims) + return out.astype(dtype_out, copy=False) + + +cdef _ndarray_base _std( + _ndarray_base a, axis=None, dtype=None, out=None, ddof=0, + keepdims=False): + ret = _var( + a, axis=axis, dtype=dtype, out=None, ddof=ddof, keepdims=keepdims) + return _math._sqrt(ret, dtype=dtype, out=out) + + +cdef _norm_preamble = ''' +template __device__ T my_norm(T x) { return x * x; } +__device__ float my_norm(const complex& x) { return norm(x); } +__device__ double my_norm(const complex& x) { return norm(x); } +''' + + +cdef _var_core_float16 = ReductionKernel( + 'S x, T mean, float32 alpha', 'float16 out', + 'my_norm(x - mean)', + 'a + b', 'out = alpha * a', '0', 'cupy_var_core_float16', + preamble=_norm_preamble) + + +cdef _var_core_float32 = ReductionKernel( + 'S x, T mean, float32 alpha', 'float32 out', + 'my_norm(x - mean)', + 'a + b', 'out = alpha * a', '0', 'cupy_var_core_float32', + preamble=_norm_preamble) + + +cdef _var_core_float64 = ReductionKernel( + 'S x, T mean, float64 alpha', 'float64 out', + 'my_norm(x - mean)', + 'a + b', 'out = alpha * a', '0', 'cupy_var_core_float64', + preamble=_norm_preamble) + + +cdef _var_core_out = ReductionKernel( + 'S x, T mean, U alpha', 'U out', + 'my_norm(x - mean)', + 'a + b', 'out = alpha * a', '0', 'cupy_var_core_out', + preamble=_norm_preamble) + + +# TODO(okuta) needs cast +cdef _mean_core = create_reduction_func( + 'cupy_mean', + ('?->d', 'B->d', 'h->d', 'H->d', 'i->d', 'I->d', 'l->d', 'L->d', + 'q->d', 'Q->d', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('in0', 'a + b', + 'out0 = a / _type_reduce(_in_ind.size() / _out_ind.size())', None)) + +cdef _mean_core_empty = create_reduction_func( + 'cupy_mean_empty', + ('?->d', 'B->d', 'h->d', 'H->d', 'i->d', 'I->d', 'l->d', 'L->d', + 'q->d', 'Q->d', + ('e->e', (None, None, None, 'float')), + 'f->f', 'd->d', 'F->F', 'D->D'), + ('in0', 'a + b', + 'out0 = a / _type_reduce(_in_ind.size() / _out_ind.size())', None), 0) + +cdef _nanmean_preamble = ''' +template +struct nanmean_st{ + typedef long long ll; + T value; + ll count; + __device__ nanmean_st() : value(0), count(0) { } + __device__ nanmean_st(T v) : + value(isnan(v) ? T(0) : v), count(isnan(v) ? 
0 : 1) { } + __device__ nanmean_st(T v, ll c) : value(v), count(c) { } +}; + +template +__device__ nanmean_st my_nanmean( + const nanmean_st& a, const nanmean_st& b) { + return nanmean_st(a.value + b.value, a.count + b.count); +} +''' + + +cdef _nanmean_func = create_reduction_func( + 'cupy_nanmean', + ('e->e', 'f->f', 'd->d', 'F->F', 'D->D'), + ('in0', 'my_nanmean(a, b)', + 'out0 = a.value / type_out0_raw(a.count)', 'nanmean_st'), + None, _nanmean_preamble) + + +_count_non_nan = create_reduction_func( + 'cupy_count_non_nan', + ('e->q', 'f->q', 'd->q', 'F->q', 'D->q'), + ('isnan(in0) ? 0 : 1', 'a + b', 'out0 = a', None), 0) + + +cpdef _ndarray_base _nanmean(_ndarray_base a, axis, dtype, out, keepdims): + return _nanmean_func(a, axis=axis, dtype=dtype, out=out, keepdims=keepdims) + + +cpdef _ndarray_base _nanstd(_ndarray_base a, axis, dtype, out, ddof, keepdims): + var = _nanvar(a, axis, dtype, None, ddof, keepdims) + return _math._sqrt(var, dtype=dtype, out=out) + + +cpdef _ndarray_base _nanvar(_ndarray_base a, axis, dtype, out, ddof, keepdims): + + _count = _count_non_nan(a, axis=axis, keepdims=True) + arrsum = _math._nansum(a, axis=axis, dtype=dtype, out=None, keepdims=True) + + if out is None: + if a.dtype == cupy.complex64 or dtype == cupy.complex64: + nanvar_core = _nanvar_core_complex64 + elif a.dtype == cupy.complex128 or dtype == cupy.complex128: + nanvar_core = _nanvar_core_complex128 + else: + nanvar_core = _nanvar_core + out = nanvar_core( + a, arrsum, _count, ddof, axis=axis, keepdims=keepdims) + else: + _nanvar_core_out( + a, arrsum, _count, ddof, out, axis=axis, keepdims=keepdims) + return out + + +cdef _nanvar_preamble = ''' +template +__device__ T nanvar_impl(S x, T mean, long long alpha) { + return (isnan(x) ? T(0) : T((x - mean) * (x - mean))) / alpha; +} + +template +__device__ T nanvar_impl(complex x, complex mean, long long alpha) { + return (isnan(x) ? T(0) : T(norm(x - mean))) / alpha; +} +''' + + +cdef _nanvar_core = ReductionKernel( + 'S x, T sum, int64 _count, int64 ddof', 'S out', + 'nanvar_impl(x, sum / _count, max(_count - ddof, 0LL))', + 'a + b', 'out = a', '0', '_nanvar_core', preamble=_nanvar_preamble) + + +cdef _nanvar_core_complex64 = ReductionKernel( + 'complex64 x, complex64 sum, int64 _count, int64 ddof', 'float32 out', + 'nanvar_impl(x, sum/static_cast(_count), max(_count-ddof, 0LL))', + 'a + b', 'out = a', '0', '_nanvar_core_complex64', + preamble=_nanvar_preamble) + + +cdef _nanvar_core_complex128 = ReductionKernel( + 'complex128 x, complex128 sum, int64 _count, int64 ddof', 'float64 out', + 'nanvar_impl(x, sum/static_cast(_count), max(_count-ddof, 0LL))', + 'a + b', 'out = a', '0', '_nanvar_core_complex128', + preamble=_nanvar_preamble) + + +cdef _nanvar_core_out = ReductionKernel( + 'S x, T sum, int64 _count, int64 ddof', 'U out', + 'nanvar_impl(x, sum / static_cast(_count), max(_count - ddof, 0LL))', + 'a + b', 'out = a', '0', '_nanvar_core', preamble=_nanvar_preamble) + + +# Variables to expose to Python +# (cythonized data cannot be exposed to Python, even with cpdef.) 
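+
+# Worked example of the `ddof` scaling used by `_var`/`_std` above (a sketch,
+# not library code): the scale factor is alpha = 1 / max(n - ddof, 0), so
+# ddof=0 gives the population variance, ddof=1 the unbiased estimator, and an
+# over-large ddof (or an empty reduction) yields nan.
+#
+#   >>> import cupy
+#   >>> x = cupy.array([1.0, 2.0, 3.0, 4.0])
+#   >>> float(x.var())           # alpha = 1/4
+#   1.25
+#   >>> float(x.var(ddof=1))     # alpha = 1/3
+#   1.6666666666666667
+#   >>> float(x.std(ddof=1))     # sqrt of the above
+#   1.2909944487358056
+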
+
+
+amax = _amax
+amin = _amin
diff --git a/cupy/_core/_scalar.pxd b/cupy/_core/_scalar.pxd
new file mode 100644
index 0000000..8cd84c5
--- /dev/null
+++ b/cupy/_core/_scalar.pxd
@@ -0,0 +1,37 @@
+cimport cython  # NOQA
+
+from libc.stdint cimport int8_t
+from libc.stdint cimport int32_t
+
+from cupy.cuda.function cimport CPointer
+
+
+@cython.final
+cdef class CScalar(CPointer):
+
+    cdef:
+        char kind
+        int8_t size
+
+    @staticmethod
+    cdef CScalar from_int32(int32_t value)
+
+    @staticmethod
+    cdef CScalar from_numpy_scalar_with_dtype(object x, object dtype)
+
+    @staticmethod
+    cdef CScalar _from_python_scalar(object x)
+
+    @staticmethod
+    cdef CScalar _from_numpy_scalar(object x)
+
+    cpdef apply_dtype(self, dtype)
+    cpdef get_numpy_type(self)
+
+
+cpdef str get_typename(dtype)
+
+cdef set scalar_type_set
+cdef CScalar scalar_to_c_scalar(object x)
+cdef object scalar_to_numpy_scalar(object x)
+cpdef str _get_cuda_scalar_repr(obj, dtype)
diff --git a/cupy/_core/_scalar.pyx b/cupy/_core/_scalar.pyx
new file mode 100644
index 0000000..f63aef4
--- /dev/null
+++ b/cupy/_core/_scalar.pyx
@@ -0,0 +1,386 @@
+from cpython cimport mem
+from libc.stdint cimport int8_t
+from libc.stdint cimport int16_t
+from libc.stdint cimport int32_t
+from libc.stdint cimport int64_t
+from libc.stdint cimport uint8_t
+from libc.stdint cimport uint16_t
+from libc.stdint cimport uint32_t
+from libc.stdint cimport uint64_t
+
+import numpy
+
+from cupy._core cimport _dtype
+from cupy._core import _dtype as _dtype_module
+from cupy._core cimport internal
+
+
+cdef union Scalar:
+    bint bool_
+    int8_t int8_
+    int16_t int16_
+    int32_t int32_
+    int64_t int64_
+    uint8_t uint8_
+    uint16_t uint16_
+    uint32_t uint32_
+    uint64_t uint64_
+    float float32_
+    double float64_
+
+
+cdef dict _typenames_base = {
+    numpy.dtype('float64'): 'double',
+    numpy.dtype('float32'): 'float',
+    numpy.dtype('float16'): 'float16',
+    numpy.dtype('complex128'): 'complex<double>',
+    numpy.dtype('complex64'): 'complex<float>',
+    numpy.dtype('int64'): 'long long',
+    numpy.dtype('int32'): 'int',
+    numpy.dtype('int16'): 'short',
+    numpy.dtype('int8'): 'signed char',
+    numpy.dtype('uint64'): 'unsigned long long',
+    numpy.dtype('uint32'): 'unsigned int',
+    numpy.dtype('uint16'): 'unsigned short',
+    numpy.dtype('uint8'): 'unsigned char',
+    numpy.dtype('bool'): 'bool',
+}
+
+
+cdef object _numpy_bool_ = numpy.bool_
+cdef object _numpy_int8 = numpy.int8
+cdef object _numpy_int16 = numpy.int16
+cdef object _numpy_int32 = numpy.int32
+cdef object _numpy_int64 = numpy.int64
+cdef object _numpy_uint8 = numpy.uint8
+cdef object _numpy_uint16 = numpy.uint16
+cdef object _numpy_uint32 = numpy.uint32
+cdef object _numpy_uint64 = numpy.uint64
+cdef object _numpy_float16 = numpy.float16
+cdef object _numpy_float32 = numpy.float32
+cdef object _numpy_float64 = numpy.float64
+cdef object _numpy_complex64 = numpy.complex64
+cdef object _numpy_complex128 = numpy.complex128
+cdef object _numpy_float_ = numpy.float_
+cdef object _numpy_complex_ = numpy.complex_
+
+
+cpdef str get_typename(dtype):
+    if dtype is None:
+        raise ValueError('dtype is None')
+    if dtype not in _typenames:
+        dtype = _dtype.get_dtype(dtype).type
+    return _typenames[dtype]
+
+
+cdef dict _typenames = {}
+cdef dict _dtype_kind_size_dict = {}
+
+
+cdef _setup_type_dict():
+    cdef char k
+    for i in _dtype_module.all_type_chars:
+        d = numpy.dtype(i)
+        t = d.type
+        _typenames[t] = _typenames_base[d]
+        k = ord(d.kind)
+        _dtype_kind_size_dict[t] = (k, d.itemsize)
+    # CUDA types
+    for t in ('cudaTextureObject_t',):
+        _typenames[t] = t
+
+
+_setup_type_dict()
+
+
+cdef set _python_scalar_type_set = {int, float, bool, complex}
+cdef set _numpy_scalar_type_set = set(_typenames.keys())
+cdef set scalar_type_set = _python_scalar_type_set | _numpy_scalar_type_set
+
+
+_int_iinfo = numpy.iinfo(int)
+cdef _int_min = _int_iinfo.min
+cdef _int_max = _int_iinfo.max
+cdef _int_type = _int_iinfo.dtype.type
+cdef bint _use_int32 = _int_type != _numpy_int64
+del _int_iinfo
+
+
+cpdef _python_scalar_to_numpy_scalar(x):
+    # Note that isinstance(x, int) matches with bool.
+    typ = type(x)
+    if typ is bool:
+        numpy_type = _numpy_bool_
+    elif typ is float:
+        numpy_type = _numpy_float_
+    elif typ is complex:
+        numpy_type = _numpy_complex_
+    else:
+        if 0x8000000000000000 <= x:
+            numpy_type = _numpy_uint64
+        elif _use_int32 and (x < _int_min or _int_max < x):
+            numpy_type = _numpy_int64
+        else:
+            # Generally `_int_type` is `numpy.int64`.
+            # On Windows, it is `numpy.int32`.
+            numpy_type = _int_type
+    return numpy_type(x)
+
+
+cdef class CScalar(CPointer):
+
+    ndim = 0
+
+    def __cinit__(self):
+        self.ptr = mem.PyMem_Malloc(
+            max(sizeof(Scalar), sizeof(double complex)))
+        self.kind = 0
+        self.size = -1
+
+    def __dealloc__(self):
+        mem.PyMem_Free(self.ptr)
+        self.ptr = 0
+
+    @staticmethod
+    cdef CScalar from_int32(int32_t value):
+        cdef CScalar s = CScalar.__new__(CScalar)
+        (<int32_t*>s.ptr)[0] = value
+        s.kind = b'i'
+        s.size = 4
+        return s
+
+    @staticmethod
+    cdef CScalar from_numpy_scalar_with_dtype(object x, object dtype):
+        cdef CScalar ret = CScalar._from_numpy_scalar(x)
+        ret.apply_dtype(dtype)
+        return ret
+
+    @staticmethod
+    cdef CScalar _from_python_scalar(object x):
+        cdef CScalar ret = CScalar.__new__(CScalar)
+        cdef Scalar* s = <Scalar*>ret.ptr
+        typ = type(x)
+        if typ is bool:
+            s.bool_ = x
+            ret.kind = b'b'
+            ret.size = 1
+        elif typ is float:
+            s.float64_ = x
+            ret.kind = b'f'
+            ret.size = 8
+        elif typ is complex:
+            (<double complex*>ret.ptr)[0] = x
+            ret.kind = b'c'
+            ret.size = 16
+        else:
+            if 0x8000000000000000 <= x:
+                s.uint64_ = x
+                ret.kind = b'u'
+            else:
+                s.int64_ = x
+                ret.kind = b'i'
+            ret.size = 8
+        return ret
+
+    @staticmethod
+    cdef CScalar _from_numpy_scalar(object x):
+        cdef CScalar ret = CScalar.__new__(CScalar)
+        cdef Scalar* s = <Scalar*>ret.ptr
+        ret.kind = ord(x.dtype.kind)
+        if ret.kind == b'i':
+            s.int64_ = x
+            ret.size = 8
+        elif ret.kind == b'u':
+            s.uint64_ = x
+            ret.size = 8
+        elif ret.kind == b'f':
+            s.float64_ = x
+            ret.size = 8
+        elif ret.kind == b'b':
+            s.bool_ = x
+            ret.size = 1
+        elif ret.kind == b'c':
+            (<double complex*>ret.ptr)[0] = x
+            ret.size = 16
+        else:
+            assert False
+        return ret
+
+    cpdef apply_dtype(self, dtype):
+        cdef Scalar* s = <Scalar*>self.ptr
+        if self.kind == b'b':
+            val = s.bool_
+            assert self.size == 1
+        elif self.kind == b'c':
+            assert self.size == 16
+            val = (<double complex*>self.ptr)[0]
+        else:
+            assert self.size == 8
+            if self.kind == b'i':
+                val = s.int64_
+            elif self.kind == b'u':
+                val = s.uint64_
+            elif self.kind == b'f':
+                val = s.float64_
+            else:
+                assert False
+        cdef char kind
+        cdef int size
+        kind, size = _dtype_kind_size_dict[dtype]
+        cdef int64_t val_i
+        cdef uint64_t val_u
+        if kind == b'b':
+            s.bool_ = val
+            assert size == 1
+        elif kind == b'i':
+            if self.kind == b'u':
+                # avoid overflow exception
+                val_i = s.uint64_
+            else:
+                val_i = val
+            if size == 1:
+                s.int8_ = val_i
+            elif size == 2:
+                s.int16_ = val_i
+            elif size == 4:
+                s.int32_ = val_i
+            elif size == 8:
+                s.int64_ = val_i
+            else:
+                assert False
+        elif kind == b'u':
+            if self.kind == b'i':
+                # avoid overflow exception
+                val_u = s.int64_
+            else:
+                val_u = val
+            if size == 1:
+                s.uint8_ = val_u
+            elif size == 2:
+                s.uint16_ = val_u
+            elif size == 4:
+                s.uint32_ = val_u
+            elif size == 8:
+                s.uint64_ = val_u
+            else:
+                assert False
+        elif kind == b'f':
+            if size == 2:
+                s.uint16_ = internal.to_float16(val)
+            elif size == 4:
+                s.float32_ = val
+            elif size == 8:
+                s.float64_ = val
+            else:
+                assert False
+        elif kind == b'c':
+            if size == 8:
+                (<float complex*>self.ptr)[0] = val
+            elif size == 16:
+                (<double complex*>self.ptr)[0] = val
+            else:
+                assert False
+        else:
+            assert False
+        self.kind = kind
+        self.size = size
+
+    cpdef get_numpy_type(self):
+        if self.kind == b'b':
+            return _numpy_bool_
+        elif self.kind == b'i':
+            if self.size == 1:
+                return _numpy_int8
+            elif self.size == 2:
+                return _numpy_int16
+            elif self.size == 4:
+                return _numpy_int32
+            elif self.size == 8:
+                return _numpy_int64
+        elif self.kind == b'u':
+            if self.size == 1:
+                return _numpy_uint8
+            elif self.size == 2:
+                return _numpy_uint16
+            elif self.size == 4:
+                return _numpy_uint32
+            elif self.size == 8:
+                return _numpy_uint64
+        elif self.kind == b'f':
+            if self.size == 2:
+                return _numpy_float16
+            elif self.size == 4:
+                return _numpy_float32
+            elif self.size == 8:
+                return _numpy_float64
+        elif self.kind == b'c':
+            if self.size == 8:
+                return _numpy_complex64
+            elif self.size == 16:
+                return _numpy_complex128
+        assert False
+
+
+cdef CScalar scalar_to_c_scalar(object x):
+    # Converts a Python or NumPy scalar to a CScalar.
+    # Returns None if the argument is not a scalar.
+    typ = type(x)
+    if typ in _python_scalar_type_set:
+        return CScalar._from_python_scalar(x)
+    elif typ in _numpy_scalar_type_set:
+        return CScalar._from_numpy_scalar(x)
+    return None
+
+
+cdef object scalar_to_numpy_scalar(object x):
+    # Converts a Python or NumPy scalar to a NumPy scalar.
+    # Returns None if the argument is not a scalar.
+    typ = type(x)
+    if typ in _python_scalar_type_set:
+        return _python_scalar_to_numpy_scalar(x)
+    elif typ in _numpy_scalar_type_set:
+        return x
+    return None
+
+
+cpdef str _get_cuda_scalar_repr(obj, dtype):
+    if dtype.kind == 'b':
+        return str(bool(obj)).lower()
+    elif dtype.kind == 'i':
+        if dtype.itemsize < 8:
+            return str(int(obj))
+        else:
+            return str(int(obj)) + 'll'
+    elif dtype.kind == 'u':
+        if dtype.itemsize < 8:
+            return str(int(obj)) + 'u'
+        else:
+            return str(int(obj)) + 'ull'
+    elif dtype.kind == 'f':
+        if dtype.itemsize < 8:
+            if numpy.isnan(obj):
+                return 'CUDART_NAN_F'
+            elif numpy.isinf(obj):
+                if obj > 0:
+                    return 'CUDART_INF_F'
+                else:
+                    return '-CUDART_INF_F'
+            else:
+                return str(float(obj)) + 'f'
+        else:
+            if numpy.isnan(obj):
+                return 'CUDART_NAN'
+            elif numpy.isinf(obj):
+                if obj > 0:
+                    return 'CUDART_INF'
+                else:
+                    return '-CUDART_INF'
+            else:
+                return str(float(obj))
+    elif dtype.kind == 'c':
+        if dtype.itemsize == 8:
+            return f'thrust::complex<float>({obj.real}, {obj.imag})'
+        elif dtype.itemsize == 16:
+            return f'thrust::complex<double>({obj.real}, {obj.imag})'
+
+    raise TypeError(f'Unsupported dtype: {dtype}')
diff --git a/cupy/_core/_ufuncs.py b/cupy/_core/_ufuncs.py
new file mode 100644
index 0000000..37dfe0c
--- /dev/null
+++ b/cupy/_core/_ufuncs.py
@@ -0,0 +1,9 @@
+from cupy._core._kernel import create_ufunc
+
+
+elementwise_copy = create_ufunc(
+    'cupy_copy',
+    ('?->?', 'b->b', 'B->B', 'h->h', 'H->H', 'i->i', 'I->I', 'l->l', 'L->L',
+     'q->q', 'Q->Q', 'e->e', 'f->f', 'd->d', 'F->F', 'D->D'),
+    'out0 = in0',
+    default_casting='unsafe')
diff --git a/cupy/_core/core.pxd b/cupy/_core/core.pxd
new file mode 100644
index 0000000..d55cada
--- /dev/null
+++ b/cupy/_core/core.pxd
@@ -0,0 +1,115 @@
+from libcpp cimport vector
+from cupy.cuda cimport memory
+
+from cupy.cuda.function cimport CPointer
+from cupy.cuda.function cimport Module
+from cupy._core._carray cimport shape_t
+from cupy._core._carray cimport strides_t
+
+
+cdef class _ndarray_base:
+    cdef:
+        object __weakref__
+        readonly Py_ssize_t size
+        public shape_t _shape
+        public strides_t _strides
+        readonly bint _c_contiguous
+        readonly bint _f_contiguous
+        # To do fast indexing in the CArray class
+        readonly bint _index_32_bits
+        readonly object dtype
+        readonly memory.MemoryPointer data
+        # TODO(niboshi): Return arbitrary owner object as `base` if the
+        # underlying memory is UnownedMemory.
+ readonly _ndarray_base base + + cdef _init_fast(self, const shape_t& shape, dtype, bint c_order) + cpdef item(self) + cpdef tolist(self) + cpdef bytes tobytes(self, order=*) + cpdef tofile(self, fid, sep=*, format=*) + cpdef dump(self, file) + cpdef bytes dumps(self) + cpdef _ndarray_base astype( + self, dtype, order=*, casting=*, subok=*, copy=*) + cpdef _ndarray_base copy(self, order=*) + cpdef _ndarray_base view(self, dtype=*, array_class=*) + cpdef fill(self, value) + cpdef _ndarray_base swapaxes(self, Py_ssize_t axis1, Py_ssize_t axis2) + cpdef _ndarray_base flatten(self, order=*) + cpdef _ndarray_base ravel(self, order=*) + cpdef _ndarray_base squeeze(self, axis=*) + cpdef _ndarray_base take(self, indices, axis=*, out=*) + cpdef put(self, indices, values, mode=*) + cpdef repeat(self, repeats, axis=*) + cpdef choose(self, choices, out=*, mode=*) + cpdef sort(self, int axis=*) + cpdef _ndarray_base argsort(self, axis=*) + cpdef partition(self, kth, int axis=*) + cpdef _ndarray_base argpartition(self, kth, axis=*) + cpdef tuple nonzero(self) + cpdef _ndarray_base compress(self, condition, axis=*, out=*) + cpdef _ndarray_base diagonal(self, offset=*, axis1=*, axis2=*) + cpdef _ndarray_base max(self, axis=*, out=*, keepdims=*) + cpdef _ndarray_base argmax(self, axis=*, out=*, dtype=*, keepdims=*) + cpdef _ndarray_base min(self, axis=*, out=*, keepdims=*) + cpdef _ndarray_base argmin(self, axis=*, out=*, dtype=*, keepdims=*) + cpdef _ndarray_base ptp(self, axis=*, out=*, keepdims=*) + cpdef _ndarray_base clip(self, min=*, max=*, out=*) + cpdef _ndarray_base round(self, decimals=*, out=*) + + cpdef _ndarray_base trace(self, offset=*, axis1=*, axis2=*, dtype=*, out=*) + cpdef _ndarray_base sum(self, axis=*, dtype=*, out=*, keepdims=*) + cpdef _ndarray_base cumsum(self, axis=*, dtype=*, out=*) + cpdef _ndarray_base mean(self, axis=*, dtype=*, out=*, keepdims=*) + cpdef _ndarray_base var(self, axis=*, dtype=*, out=*, ddof=*, keepdims=*) + cpdef _ndarray_base std(self, axis=*, dtype=*, out=*, ddof=*, keepdims=*) + cpdef _ndarray_base prod(self, axis=*, dtype=*, out=*, keepdims=*) + cpdef _ndarray_base cumprod(self, axis=*, dtype=*, out=*) + cpdef _ndarray_base _add_reduceat(self, indices, axis, dtype, out) + cpdef _ndarray_base all(self, axis=*, out=*, keepdims=*) + cpdef _ndarray_base any(self, axis=*, out=*, keepdims=*) + cpdef _ndarray_base conj(self) + cpdef _ndarray_base conjugate(self) + cpdef get(self, stream=*, order=*, out=*) + cpdef set(self, arr, stream=*) + cpdef _ndarray_base reduced_view(self, dtype=*) + cpdef _update_c_contiguity(self) + cpdef _update_f_contiguity(self) + cpdef _update_contiguity(self) + cpdef _set_shape_and_strides(self, const shape_t& shape, + const strides_t& strides, + bint update_c_contiguity, + bint update_f_contiguity) + cdef _ndarray_base _view(self, subtype, const shape_t& shape, + const strides_t& strides, + bint update_c_contiguity, + bint update_f_contiguity, obj) + cpdef _set_contiguous_strides( + self, Py_ssize_t itemsize, bint is_c_contiguous) + cdef CPointer get_pointer(self) + cpdef object toDlpack(self) + + +cpdef _ndarray_base _internal_ascontiguousarray(_ndarray_base a) +cpdef _ndarray_base _internal_asfortranarray(_ndarray_base a) +cpdef _ndarray_base ascontiguousarray(_ndarray_base a, dtype=*) +cpdef _ndarray_base asfortranarray(_ndarray_base a, dtype=*) + +cpdef Module compile_with_cache(str source, tuple options=*, arch=*, + cachd_dir=*, prepend_cupy_headers=*, + backend=*, translate_cucomplex=*, + enable_cooperative_groups=*, + 
name_expressions=*, log_stream=*, + bint jitify=*) + + +# TODO(niboshi): Move to _routines_creation.pyx +cpdef _ndarray_base array( + obj, dtype=*, bint copy=*, order=*, bint subok=*, Py_ssize_t ndmin=*) +cpdef _ndarray_base _convert_object_with_cuda_array_interface(a) + +cdef _ndarray_base _ndarray_init(subtype, const shape_t& shape, dtype, obj) + +cdef _ndarray_base _create_ndarray_from_shape_strides( + subtype, const shape_t& shape, const strides_t& strides, dtype, obj) diff --git a/cupy/_core/core.pyx b/cupy/_core/core.pyx new file mode 100644 index 0000000..1cad745 --- /dev/null +++ b/cupy/_core/core.pyx @@ -0,0 +1,2813 @@ +# distutils: language = c++ + +import contextlib +import functools +import os +import pickle +import re +import warnings + +import numpy + +import cupy +from cupy._core._kernel import create_ufunc +from cupy._core._kernel import ElementwiseKernel +from cupy._core._ufuncs import elementwise_copy +from cupy._core import flags +from cupy._core import syncdetect +from cupy import cuda +from cupy.cuda import memory as memory_module +from cupy.cuda import stream as stream_mod + + +from cupy_backends.cuda.api.runtime import CUDARuntimeError +from cupy import _util + +cimport cython # NOQA +from libc.stdint cimport int64_t, intptr_t + +from cupy._core cimport _carray +from cupy._core cimport _dtype +from cupy._core._dtype cimport get_dtype +from cupy._core._kernel cimport create_ufunc +from cupy._core cimport _routines_binary as _binary +from cupy._core cimport _routines_indexing as _indexing +from cupy._core cimport _routines_linalg as _linalg +from cupy._core cimport _routines_logic as _logic +from cupy._core cimport _routines_manipulation as _manipulation +from cupy._core cimport _routines_math as _math +from cupy._core cimport _routines_sorting as _sorting +from cupy._core cimport _routines_statistics as _statistics +from cupy._core cimport _scalar +from cupy._core cimport dlpack +from cupy._core cimport internal +from cupy.cuda cimport device +from cupy.cuda cimport function +from cupy.cuda cimport pinned_memory +from cupy.cuda cimport memory +from cupy.cuda cimport stream as stream_module +from cupy_backends.cuda cimport stream as _stream_module +from cupy_backends.cuda.api cimport runtime +from cupy_backends.cuda.libs cimport cublas + + +# If rop of cupy.ndarray is called, cupy's op is the last chance. +# If op of cupy.ndarray is called and the `other` is cupy.ndarray, too, +# it is safe to call cupy's op. +# Otherwise, use this function `_should_use_rop` to choose +# * [True] return NotImplemented to defer rhs, or +# * [False] call NumPy's ufunc to try all `__array_ufunc__`. +# Note that extension types (`cdef class`) in Cython 0.x shares +# implementations of op and rop. (i.e. `__radd__(self, other)` is +# `__add__(other, self)`.) +# +# It follows NEP 13 except that cupy also implements the fallback to +# `__array_priority__`, which seems fair and necessary because of the +# following facts: +# * `numpy` : `scipy.sparse` = `cupy` : `cupyx.scipy.sparse`; +# * NumPy ignores `__array_priority__` attributes of arguments if NumPy finds +# `__array_function__` of `cupy.ndarray`; +# * SciPy sparse classes don't implement `__array_function__` and they even +# don't set `__array_function__ = None` to opt-out the feature; and +# * `__array_priority__` of SciPy sparse classes is respected because +# `numpy.ndarray.__array_function__` does not disable `__array_priority__`. 
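+# Illustration of the dispatch rule described above (a sketch; `Wrapped` is a
+# hypothetical class, not part of CuPy): setting `__array_ufunc__ = None`
+# makes `_should_use_rop` return True, so `cupy.ndarray.__add__` returns
+# NotImplemented and Python falls back to the other operand's `__radd__`.
+#
+#   >>> import cupy
+#   >>> class Wrapped:
+#   ...     __array_ufunc__ = None   # opt out per NEP 13
+#   ...     def __radd__(self, other):
+#   ...         return 'deferred to Wrapped.__radd__'
+#   >>> cupy.arange(3) + Wrapped()
+#   'deferred to Wrapped.__radd__'
+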
+@cython.profile(False) +cdef inline _should_use_rop(x, y): + try: + y_ufunc = y.__array_ufunc__ + except AttributeError: + # NEP 13's recommendation is `return False`. + xp = getattr(x, '__array_priority__', 0) + yp = getattr(y, '__array_priority__', 0) + return xp < yp + return y_ufunc is None + + +cdef tuple _HANDLED_TYPES + +cdef object _null_context = contextlib.nullcontext() + + +class ndarray(_ndarray_base): + """ + __init__(self, shape, dtype=float, memptr=None, strides=None, order='C') + + Multi-dimensional array on a CUDA device. + + This class implements a subset of methods of :class:`numpy.ndarray`. + The difference is that this class allocates the array content on the + current GPU device. + + Args: + shape (tuple of ints): Length of axes. + dtype: Data type. It must be an argument of :class:`numpy.dtype`. + memptr (cupy.cuda.MemoryPointer): Pointer to the array content head. + strides (tuple of ints or None): Strides of data in memory. + order ({'C', 'F'}): Row-major (C-style) or column-major + (Fortran-style) order. + + Attributes: + base (None or cupy.ndarray): Base array from which this array is + created as a view. + data (cupy.cuda.MemoryPointer): Pointer to the array content head. + ~ndarray.dtype(numpy.dtype): Dtype object of element type. + + .. seealso:: + `Data type objects (dtype) \ + `_ + ~ndarray.size (int): Number of elements this array holds. + + This is equivalent to product over the shape tuple. + + .. seealso:: :attr:`numpy.ndarray.size` + + """ + + __module__ = 'cupy' + + def __new__(cls, *args, _obj=None, _no_init=False, **kwargs): + x = super().__new__(cls, *args, **kwargs) + if _no_init: + return x + x._init(*args, **kwargs) + if cls is not ndarray: + x.__array_finalize__(_obj) + return x + + def __init__(self, *args, **kwargs): + # Prevent from calling the super class `_ndarray_base.__init__()` as + # it is used to check accidental direct instantiation of underlaying + # `_ndarray_base` extention. + pass + + def __array_finalize__(self, obj): + pass + + # We provide the Python-level wrapper of `view` method to follow NumPy's + # API signature, as it seems that Cython's `cpdef`d methods does not take + # an argument named `type`. Cython also does not take starargs + # (`*args` and `**kwargs`) for `cpdef`d methods so we can not interpret the + # arguments `dtype` and `type` from them. + def view(self, dtype=None, type=None): + """Returns a view of the array. + + Args: + dtype: If this is different from the data type of the array, the + returned view reinterpret the memory sequence as an array of + this type. + + Returns: + cupy.ndarray: A view of the array. A reference to the original + array is stored at the :attr:`~ndarray.base` attribute. + + .. seealso:: :meth:`numpy.ndarray.view` + + """ + return super(ndarray, self).view(dtype=dtype, array_class=type) + + +cdef class _ndarray_base: + + def __init__(self, *args, **kwargs): + # Raise an error if underlaying `_ndarray_base` extension type is + # directly instantiated. We must instantiate `ndarray` class instead + # for our ndarray subclassing mechanism. + raise RuntimeError('Must not be directly instantiated') + + def _init(self, shape, dtype=float, memptr=None, strides=None, + order='C'): + cdef Py_ssize_t x, itemsize + cdef tuple s = internal.get_size(shape) + del shape + + cdef int order_char = ( + b'C' if order is None else internal._normalize_order(order)) + + # `strides` is prioritized over `order`, but invalid `order` should be + # checked even if `strides` is given. 
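+        # For example (a sketch): both constructions below describe the same
+        # C-contiguous 2x2 float64 view of `memptr`; when `strides` is given
+        # it takes effect and `order` is only validated:
+        #
+        #   >>> buf = cupy.ndarray((4,), dtype=cupy.float64).data
+        #   >>> a = cupy.ndarray((2, 2), cupy.float64, memptr=buf, order='C')
+        #   >>> b = cupy.ndarray((2, 2), cupy.float64, memptr=buf,
+        #   ...                  strides=(16, 8), order='C')
+        #   >>> a.strides == b.strides
+        #   True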
+ if order_char != b'C' and order_char != b'F': + raise ValueError('order not understood. order=%s' % order) + + # Check for erroneous shape + if len(s) > _carray.MAX_NDIM: + msg = 'maximum supported dimension for an ndarray is ' + msg += f'{_carray.MAX_NDIM}, found {len(s)}' + raise ValueError(msg) + self._shape.reserve(len(s)) + for x in s: + if x < 0: + raise ValueError('Negative dimensions are not allowed') + self._shape.push_back(x) + del s + + # dtype + self.dtype, itemsize = _dtype.get_dtype_with_itemsize(dtype) + + # Store shape and strides + if strides is not None: + if memptr is None: + raise ValueError('memptr is required if strides is given.') + self._set_shape_and_strides(self._shape, strides, True, True) + elif order_char == b'C': + self._set_contiguous_strides(itemsize, True) + elif order_char == b'F': + self._set_contiguous_strides(itemsize, False) + else: + assert False + + # data + if memptr is None: + self.data = memory.alloc(self.size * itemsize) + self._index_32_bits = (self.size * itemsize) <= (1 << 31) + else: + self.data = memptr + bound = cupy._core._memory_range.get_bound(self) + self._index_32_bits = bound[1] - bound[0] <= (1 << 31) + + cdef _init_fast(self, const shape_t& shape, dtype, bint c_order): + """ For internal ndarray creation. """ + cdef Py_ssize_t itemsize + if shape.size() > _carray.MAX_NDIM: + msg = 'maximum supported dimension for an ndarray is ' + msg += f'{_carray.MAX_NDIM}, found {shape.size()}' + raise ValueError(msg) + self._shape = shape + self.dtype, itemsize = _dtype.get_dtype_with_itemsize(dtype) + self._set_contiguous_strides(itemsize, c_order) + self.data = memory.alloc(self.size * itemsize) + self._index_32_bits = (self.size * itemsize) <= (1 << 31) + + @property + def __cuda_array_interface__(self): + if runtime._is_hip_environment: + raise AttributeError( + 'HIP/ROCm does not support cuda array interface') + cdef dict desc = { + 'shape': self.shape, + 'typestr': self.dtype.str, + 'descr': self.dtype.descr, + } + cdef int ver = _util.CUDA_ARRAY_INTERFACE_EXPORT_VERSION + cdef intptr_t stream_ptr + + if ver == 3: + stream_ptr = stream_module.get_current_stream_ptr() + # CAI v3 says setting the stream field to 0 is disallowed + if stream_ptr == 0: + stream_ptr = _stream_module.get_default_stream_ptr() + desc['stream'] = stream_ptr + elif ver == 2: + # Old behavior (prior to CAI v3): stream sync is explicitly handled + # by users. To restore this behavior, we do not export any stream + # if CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION is set to 2 (so that + # other participating libraries lacking a finer control over sync + # behavior can avoid syncing). 
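+            # For reference (a sketch; the pointer value is illustrative):
+            # with the default version 3, a C-contiguous (2, 2) float32
+            # array exports roughly
+            #   {'shape': (2, 2), 'typestr': '<f4', 'descr': [('', '<f4')],
+            #    'stream': 1, 'version': 3, 'strides': None,
+            #    'data': (140011223344, False)}
+            # whereas under version 2 the 'stream' key is simply absent.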
+ pass + else: + raise ValueError('CUPY_CUDA_ARRAY_INTERFACE_EXPORT_VERSION can ' + 'only be set to 3 (default) or 2') + desc['version'] = ver + if self._c_contiguous: + desc['strides'] = None + else: + desc['strides'] = self.strides + if self.size > 0: + desc['data'] = (self.data.ptr, False) + else: + desc['data'] = (0, False) + + return desc + + def __dlpack__(self, stream=None): + # Note: the stream argument is supplied by the consumer, not by CuPy + curr_stream = stream_module.get_current_stream() + curr_stream_ptr = curr_stream.ptr + + # stream must be an int for CUDA/ROCm + if not runtime._is_hip_environment: # CUDA + if stream is None: + stream = runtime.streamLegacy + elif not isinstance(stream, int) or stream < -1: + # DLPack does not accept 0 as a valid stream, but there is a + # bug in PyTorch that exports the default stream as 0, which + # renders the protocol unusable, we will accept a 0 value + # meanwhile. + raise ValueError( + f'On CUDA, the valid stream for the DLPack protocol is -1,' + f' 1, 2, or any larger value, but {stream} was provided') + if stream == 0: + warnings.warn( + 'Stream 0 is passed from a library that you are' + ' converting to; CuPy assumes 0 as a legacy default ' + 'stream. Please report this problem to the library as this' + ' violates the DLPack protocol.') + stream = runtime.streamLegacy + if curr_stream_ptr == 0: + curr_stream_ptr = runtime.streamLegacy + else: # ROCm/HIP + if stream is None: + stream = 0 + elif (not isinstance(stream, int) or stream < -1 + or stream in (1, 2)): + raise ValueError( + f'On ROCm/HIP, the valid stream for the DLPack protocol is' + f' -1, 0, or any value > 2, but {stream} was provided') + + # if -1, no stream order should be established; otherwise, the consumer + # stream should wait for the work on CuPy's current stream to finish + if stream >= 0 and stream != curr_stream_ptr: + next_stream = stream_mod.ExternalStream(stream) + event = curr_stream.record() + next_stream.wait_event(event) + + return dlpack.toDlpack(self) + + def __dlpack_device__(self): + if not runtime._is_hip_environment: + attrs = runtime.pointerGetAttributes(self.data.ptr) + is_managed = ( + attrs.type == runtime.memoryTypeManaged + and _util.DLPACK_EXPORT_VERSION >= (0, 6)) + if is_managed: + device_type = dlpack.managed_CUDA + else: + device_type = dlpack.device_CUDA + else: + device_type = dlpack.device_ROCM + return (device_type, self.device.id) + + # The definition order of attributes and methods are borrowed from the + # order of documentation at the following NumPy document. + # https://numpy.org/doc/stable/reference/arrays.ndarray.html + + # ------------------------------------------------------------------------- + # Memory layout + # ------------------------------------------------------------------------- + @property + def flags(self): + """Object containing memory-layout information. + + It only contains ``c_contiguous``, ``f_contiguous``, and ``owndata`` + attributes. All of these are read-only. Accessing by indexes is also + supported. + + .. seealso:: :attr:`numpy.ndarray.flags` + + """ + return flags.Flags(self._c_contiguous, self._f_contiguous, + self.base is None) + + property shape: + """Lengths of axes. + + Setter of this property involves reshaping without copy. If the array + cannot be reshaped without copy, it raises an exception. + + .. 
seealso: :attr:`numpy.ndarray.shape` + + """ + + def __get__(self): + return tuple(self._shape) + + def __set__(self, newshape): + _manipulation._ndarray_shape_setter(self, newshape) + + @property + def strides(self): + """Strides of axes in bytes. + + .. seealso:: :attr:`numpy.ndarray.strides` + + """ + return tuple(self._strides) + + @property + def ndim(self): + """Number of dimensions. + + ``a.ndim`` is equivalent to ``len(a.shape)``. + + .. seealso:: :attr:`numpy.ndarray.ndim` + + """ + return self._shape.size() + + @property + def itemsize(self): + """Size of each element in bytes. + + .. seealso:: :attr:`numpy.ndarray.itemsize` + + """ + return self.dtype.itemsize + + @property + def nbytes(self): + """Total size of all elements in bytes. + + It does not count skips between elements. + + .. seealso:: :attr:`numpy.ndarray.nbytes` + + """ + return self.size * self.dtype.itemsize + + # ------------------------------------------------------------------------- + # Other attributes + # ------------------------------------------------------------------------- + @property + def T(self): + """Shape-reversed view of the array. + + If ndim < 2, then this is just a reference to the array itself. + + """ + if self.ndim < 2: + return self + else: + return _manipulation._T(self) + + @property + def flat(self): + return cupy.flatiter(self) + + __array_priority__ = 100 + + # ------------------------------------------------------------------------- + # Array interface + # ------------------------------------------------------------------------- + # TODO(beam2d): Implement __array_interface__ + + # ------------------------------------------------------------------------- + # foreign function interface + # ------------------------------------------------------------------------- + @property + def cstruct(self): + """C representation of the array. + + This property is used for sending an array to CUDA kernels. The type of + returned C structure is different for different dtypes and ndims. The + definition of C type is written in ``cupy/carray.cuh``. + + """ + return _CArray_from_ndarray(self) + + # ------------------------------------------------------------------------- + # Array conversion + # ------------------------------------------------------------------------- + cpdef item(self): + """Converts the array with one element to a Python scalar + + Returns: + int or float or complex: The element of the array. + + .. seealso:: :meth:`numpy.ndarray.item` + + """ + if self.size != 1: + raise ValueError( + 'can only convert an array of size 1 to a Python scalar') + return self.get().item() + + cpdef tolist(self): + """Converts the array to a (possibly nested) Python list. + + Returns: + list: The possibly nested Python list of array elements. + + .. seealso:: :meth:`numpy.ndarray.tolist` + + """ + return self.get().tolist() + + # TODO(okuta): Implement itemset + # TODO(okuta): Implement tostring + + cpdef bytes tobytes(self, order='C'): + """Turns the array into a Python bytes object.""" + return self.get().tobytes(order) + + cpdef tofile(self, fid, sep='', format='%s'): + """Writes the array to a file. + + .. seealso:: :meth:`numpy.ndarray.tofile` + + """ + self.get().tofile(fid, sep, format) + + cpdef dump(self, file): + """Dumps a pickle of the array to a file. + + Dumped file can be read back to :class:`cupy.ndarray` by + :func:`cupy.load`. 
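+
+        For example (a minimal sketch), the same pickle round-trip can be
+        done in memory with :meth:`dumps`:
+
+            >>> import pickle
+            >>> x = cupy.arange(3)
+            >>> y = pickle.loads(x.dumps())
+            >>> bool((x == y).all())
+            True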
+ + """ + pickle.dump(self, file, -1) + + cpdef bytes dumps(self): + """Dumps a pickle of the array to a string.""" + return pickle.dumps(self, -1) + + cpdef _ndarray_base astype( + self, dtype, order='K', casting=None, subok=None, copy=True): + """Casts the array to given data type. + + Args: + dtype: Type specifier. + order ({'C', 'F', 'A', 'K'}): Row-major (C-style) or column-major + (Fortran-style) order. + When ``order`` is 'A', it uses 'F' if ``a`` is column-major and + uses 'C' otherwise. + And when ``order`` is 'K', it keeps strides as closely as + possible. + copy (bool): If it is False and no cast happens, then this method + returns the array itself. Otherwise, a copy is returned. + + Returns: + If ``copy`` is False and no cast is required, then the array itself + is returned. Otherwise, it returns a (possibly casted) copy of the + array. + + .. note:: + This method currently does not support ``casting``, and ``subok`` + arguments. + + .. seealso:: :meth:`numpy.ndarray.astype` + + """ + cdef strides_t strides + + # TODO(beam2d): Support casting and subok option + if casting is not None: + raise TypeError('casting is not supported yet') + if subok is not None: + raise TypeError('subok is not supported yet') + + if order is None: + order = 'K' + cdef int order_char = internal._normalize_order(order) + + dtype = get_dtype(dtype) + if dtype == self.dtype: + if not copy and ( + order_char == b'K' or + order_char == b'A' and (self._c_contiguous or + self._f_contiguous) or + order_char == b'C' and self._c_contiguous or + order_char == b'F' and self._f_contiguous): + return self + + order_char = internal._update_order_char( + self._c_contiguous, self._f_contiguous, order_char) + + if order_char == b'K': + strides = internal._get_strides_for_order_K(self, dtype) + newarray = _ndarray_init(ndarray, self._shape, dtype, None) + # TODO(niboshi): Confirm update_x_contiguity flags + newarray._set_shape_and_strides(self._shape, strides, True, True) + else: + newarray = ndarray(self.shape, dtype=dtype, order=chr(order_char)) + + if self.size == 0: + # skip copy + if self.dtype.kind == 'c' and newarray.dtype.kind not in 'bc': + warnings.warn( + 'Casting complex values to real discards the imaginary ' + 'part', + numpy.ComplexWarning) + else: + elementwise_copy(self, newarray) + return newarray + + # TODO(okuta): Implement byteswap + + cpdef _ndarray_base copy(self, order='C'): + """Returns a copy of the array. + + This method makes a copy of a given array in the current device. + Even when a given array is located in another device, you can copy it + to the current device. + + Args: + order ({'C', 'F', 'A', 'K'}): Row-major (C-style) or column-major + (Fortran-style) order. + When ``order`` is 'A', it uses 'F' if ``a`` is column-major and + uses 'C' otherwise. + And when `order` is 'K', it keeps strides as closely as + possible. + + .. 
seealso:: + :func:`cupy.copy` for full documentation, + :meth:`numpy.ndarray.copy` + + """ + cdef _ndarray_base x + if self.size == 0: + return self.astype(self.dtype, order=order) + + dev_id = device.get_device_id() + if self.data.device_id == dev_id: + return self.astype(self.dtype, order=order) + + # It need to make a contiguous copy for copying from another device + prev_device = runtime.getDevice() + try: + runtime.setDevice(self.device.id) + x = self.astype(self.dtype, order=order, copy=False) + finally: + runtime.setDevice(prev_device) + newarray = _ndarray_init(ndarray, x._shape, x.dtype, None) + if not x._c_contiguous and not x._f_contiguous: + raise NotImplementedError( + 'CuPy cannot copy non-contiguous array between devices.') + # TODO(niboshi): Confirm update_x_contiguity flags + newarray._strides = x._strides + newarray._c_contiguous = x._c_contiguous + newarray._f_contiguous = x._f_contiguous + + copy_context = _null_context + if runtime._is_hip_environment: + # HIP requires changing the active device to the one where + # src data is before the copy. From the docs: + # it is recommended to set the current device to the device + # where the src data is physically located. + copy_context = self.device + with copy_context: + newarray.data.copy_from_device_async(x.data, x.nbytes) + return newarray + + cpdef _ndarray_base view(self, dtype=None, array_class=None): + cdef Py_ssize_t ndim, axis, tmp_size + cdef int self_is, v_is + + if dtype is not None: + if type(dtype) is type and issubclass(dtype, ndarray): + if array_class is not None: + raise ValueError('Cannot specify output type twice.') + array_class = dtype + dtype = None + + if ( + array_class is not None and ( + type(array_class) is not type or + not issubclass(array_class, ndarray) + ) + ): + raise ValueError('Type must be a sub-type of ndarray type') + + if array_class is None: + array_class = type(self) + + v = self._view( + array_class, self._shape, self._strides, False, False, self) + if dtype is None: + return v + + v.dtype, v_is = _dtype.get_dtype_with_itemsize(dtype) + self_is = self.dtype.itemsize + if v_is == self_is: + return v + + ndim = self._shape.size() + if ndim == 0: + raise ValueError( + 'Changing the dtype of a 0d array is only supported if ' + 'the itemsize is unchanged') + axis = ndim - 1 + if ( + self._shape[axis] != 1 + and self.size != 0 + and self._strides[axis] != self.dtype.itemsize + ): + raise ValueError( + 'To change to a dtype of a different size, the last axis ' + 'must be contiguous') + + # Normalize `_strides[axis]` whenever itemsize changes + v._strides[axis] = v_is + + tmp_size = v._shape[axis] * self_is + if tmp_size % v_is != 0: + raise ValueError( + 'When changing to a larger dtype, its size must be a ' + 'divisor of the total size in bytes of the last axis ' + 'of the array.') + # itemsize of dtype in CuPy is one of 1, 2, 4, 8, 16. + # Thus, CuPy does not raise the following: + # raise ValueError( + # 'When changing to a smaller dtype, its size must be a ' + # 'divisor of the size of original dtype') + v._shape[axis] = tmp_size // v_is + v.size = v.size * self_is // v_is # divisible because shape[axis] is. + + if axis != ndim - 1: + v._update_c_contiguity() + if axis != 0: + v._update_f_contiguity() + return v + + # TODO(okuta): Implement getfield + # TODO(okuta): Implement setflags + + cpdef fill(self, value): + """Fills the array with a scalar value. + + Args: + value: A scalar value to fill the array content. + + .. 
seealso:: :meth:`numpy.ndarray.fill`
+
+        """
+        if isinstance(value, cupy.ndarray):
+            if value.shape != ():
+                raise ValueError(
+                    'non-scalar cupy.ndarray cannot be used for fill')
+            value = value.astype(self.dtype, copy=False)
+            fill_kernel(value, self)
+            return
+
+        if isinstance(value, numpy.ndarray):
+            if value.shape != ():
+                raise ValueError(
+                    'non-scalar numpy.ndarray cannot be used for fill')
+            value = value.astype(self.dtype, copy=False).item()
+
+        if value == 0 and self._c_contiguous:
+            self.data.memset_async(0, self.nbytes)
+        else:
+            fill_kernel(value, self)
+
+    # -------------------------------------------------------------------------
+    # Shape manipulation
+    # -------------------------------------------------------------------------
+    def reshape(self, *shape, order='C'):
+        """Returns an array of a different shape and the same content.
+
+        .. seealso::
+            :func:`cupy.reshape` for full documentation,
+            :meth:`numpy.ndarray.reshape`
+
+        """
+        return _manipulation._ndarray_reshape(self, shape, order)
+
+    # TODO(okuta): Implement resize
+
+    def transpose(self, *axes):
+        """Returns a view of the array with axes permuted.
+
+        .. seealso::
+            :func:`cupy.transpose` for full documentation,
+            :meth:`numpy.ndarray.transpose`
+
+        """
+        return _manipulation._ndarray_transpose(self, axes)
+
+    cpdef _ndarray_base swapaxes(self, Py_ssize_t axis1, Py_ssize_t axis2):
+        """Returns a view of the array with two axes swapped.
+
+        .. seealso::
+            :func:`cupy.swapaxes` for full documentation,
+            :meth:`numpy.ndarray.swapaxes`
+
+        """
+        return _manipulation._ndarray_swapaxes(self, axis1, axis2)
+
+    cpdef _ndarray_base flatten(self, order='C'):
+        """Returns a copy of the array flattened into one dimension.
+
+        Args:
+            order ({'C', 'F', 'A', 'K'}):
+                'C' means to flatten in row-major (C-style) order.
+                'F' means to flatten in column-major (Fortran-style) order.
+                'A' means to flatten in column-major order if `self` is
+                Fortran *contiguous* in memory, row-major order otherwise.
+                'K' means to flatten `self` in the order the elements occur
+                in memory. The default is 'C'.
+
+        Returns:
+            cupy.ndarray: A copy of the array with one dimension.
+
+        .. seealso:: :meth:`numpy.ndarray.flatten`
+
+        """
+        return _manipulation._ndarray_flatten(self, order)
+
+    cpdef _ndarray_base ravel(self, order='C'):
+        """Returns an array flattened into one dimension.
+
+        .. seealso::
+            :func:`cupy.ravel` for full documentation,
+            :meth:`numpy.ndarray.ravel`
+
+        """
+        return _internal_ascontiguousarray(
+            _manipulation._ndarray_ravel(self, order))
+
+    cpdef _ndarray_base squeeze(self, axis=None):
+        """Returns a view with size-one axes removed.
+
+        .. seealso::
+            :func:`cupy.squeeze` for full documentation,
+            :meth:`numpy.ndarray.squeeze`
+
+        """
+        return _manipulation._ndarray_squeeze(self, axis)
+
+    # -------------------------------------------------------------------------
+    # Item selection and manipulation
+    # -------------------------------------------------------------------------
+    cpdef _ndarray_base take(self, indices, axis=None, out=None):
+        """Returns an array of elements at given indices along the axis.
+
+        .. seealso::
+            :func:`cupy.take` for full documentation,
+            :meth:`numpy.ndarray.take`
+
+        """
+        return _indexing._ndarray_take(self, indices, axis, out)
+
+    cpdef put(self, indices, values, mode='wrap'):
+        """Replaces specified elements of an array with given values.
+
+        ..
seealso:: + :func:`cupy.put` for full documentation, + :meth:`numpy.ndarray.put` + + """ + return _indexing._ndarray_put(self, indices, values, mode) + + cpdef repeat(self, repeats, axis=None): + """Returns an array with repeated arrays along an axis. + + .. seealso:: + :func:`cupy.repeat` for full documentation, + :meth:`numpy.ndarray.repeat` + + """ + return _manipulation._ndarray_repeat(self, repeats, axis) + + cpdef choose(self, choices, out=None, mode='raise'): + # TODO(niboshi): Write docstring + return _indexing._ndarray_choose(self, choices, out, mode) + + cpdef sort(self, int axis=-1): + """Sort an array, in-place with a stable sorting algorithm. + + Args: + axis (int): Axis along which to sort. Default is -1, which means + sort along the last axis. + + .. note:: + Due to its implementation, ``ndarray.sort`` currently supports + only arrays with their own data, and does not support the + ``kind`` and ``order`` parameters that ``numpy.ndarray.sort`` + supports. + + .. seealso:: + :func:`cupy.sort` for full documentation, + :meth:`numpy.ndarray.sort` + + """ + # TODO(takagi): Support kind argument. + _sorting._ndarray_sort(self, axis) + + cpdef _ndarray_base argsort(self, axis=-1): + """Returns the indices that would sort an array with stable sorting. + + Args: + axis (int or None): Axis along which to sort. Default is -1, which + means sort along the last axis. If None is supplied, the array + is flattened before sorting. + + Returns: + cupy.ndarray: Array of indices that sort the array. + + .. seealso:: + :func:`cupy.argsort` for full documentation, + :meth:`numpy.ndarray.argsort` + + """ + # TODO(takagi): Support kind argument. + return _sorting._ndarray_argsort(self, axis) + + cpdef partition(self, kth, int axis=-1): + """Partitions an array. + + Args: + kth (int or sequence of ints): Element index to partition by. If + supplied with a sequence of k-th values, it will partition all + of the indexed elements into their sorted position at once. + + axis (int): Axis along which to sort. Default is -1, which means + sort along the last axis. + + .. seealso:: + :func:`cupy.partition` for full documentation, + :meth:`numpy.ndarray.partition` + + """ + _sorting._ndarray_partition(self, kth, axis) + + cpdef _ndarray_base argpartition(self, kth, axis=-1): + """Returns the indices that would partially sort an array. + + Args: + kth (int or sequence of ints): Element index to partition by. If + supplied with a sequence of k-th values, it will partition all + of the indexed elements into their sorted position at once. + axis (int or None): Axis along which to sort. Default is -1, which + means sort along the last axis. If None is supplied, the array + is flattened before sorting. + + Returns: + cupy.ndarray: Array of indices that partition the array along the + given axis. + + .. seealso:: + :func:`cupy.argpartition` for full documentation, + :meth:`numpy.ndarray.argpartition` + + """ + return _sorting._ndarray_argpartition(self, kth, axis) + + def searchsorted(self, v, side='left', sorter=None): + """Finds indices where elements of v should be inserted to maintain order. + + For full documentation, see :func:`cupy.searchsorted` + + Returns: + cupy.ndarray: Array of insertion points with the same shape as + ``v``. + + .. seealso:: :func:`numpy.searchsorted` + + """ # NOQA + return cupy.searchsorted(self, v, side, sorter) + + cpdef tuple nonzero(self): + """Return the indices of the elements that are non-zero. + + Each array in the returned tuple contains the indices of the non-zero + elements in that dimension. + + Returns: + tuple of arrays: Indices of elements that are non-zero. + + ..
warning:: + + This function may synchronize the device. + + .. seealso:: + :func:`numpy.nonzero` + + """ + return _indexing._ndarray_nonzero(self) + + cpdef _ndarray_base compress(self, condition, axis=None, out=None): + """Returns selected slices of this array along given axis. + + .. warning:: + + This function may synchronize the device. + + .. seealso:: + :func:`cupy.compress` for full documentation, + :meth:`numpy.ndarray.compress` + + """ + return _indexing._ndarray_compress(self, condition, axis, out) + + cpdef _ndarray_base diagonal(self, offset=0, axis1=0, axis2=1): + """Returns a view of the specified diagonals. + + .. seealso:: + :func:`cupy.diagonal` for full documentation, + :meth:`numpy.ndarray.diagonal` + + """ + return _indexing._ndarray_diagonal(self, offset, axis1, axis2) + + # ------------------------------------------------------------------------- + # Calculation + # ------------------------------------------------------------------------- + cpdef _ndarray_base max(self, axis=None, out=None, keepdims=False): + """Returns the maximum along a given axis. + + .. seealso:: + :func:`cupy.amax` for full documentation, + :meth:`numpy.ndarray.max` + + """ + return _statistics._ndarray_max(self, axis, out, None, keepdims) + + cpdef _ndarray_base argmax( + self, axis=None, out=None, dtype=None, keepdims=False): + """Returns the indices of the maximum along a given axis. + + .. note:: + ``dtype`` and ``keepdims`` arguments are specific to CuPy. They are + not in NumPy. + + .. note:: + ``axis`` argument accepts a tuple of ints, but this is specific to + CuPy. NumPy does not support it. + + .. seealso:: + :func:`cupy.argmax` for full documentation, + :meth:`numpy.ndarray.argmax` + + """ + return _statistics._ndarray_argmax(self, axis, out, dtype, keepdims) + + cpdef _ndarray_base min(self, axis=None, out=None, keepdims=False): + """Returns the minimum along a given axis. + + .. seealso:: + :func:`cupy.amin` for full documentation, + :meth:`numpy.ndarray.min` + + """ + return _statistics._ndarray_min(self, axis, out, None, keepdims) + + cpdef _ndarray_base argmin( + self, axis=None, out=None, dtype=None, keepdims=False): + """Returns the indices of the minimum along a given axis. + + .. note:: + ``dtype`` and ``keepdims`` arguments are specific to CuPy. They are + not in NumPy. + + .. note:: + ``axis`` argument accepts a tuple of ints, but this is specific to + CuPy. NumPy does not support it. + + .. seealso:: + :func:`cupy.argmin` for full documentation, + :meth:`numpy.ndarray.argmin` + + """ + return _statistics._ndarray_argmin(self, axis, out, dtype, keepdims) + + cpdef _ndarray_base ptp(self, axis=None, out=None, keepdims=False): + """Returns (maximum - minimum) along a given axis. + + .. seealso:: + :func:`cupy.ptp` for full documentation, + :meth:`numpy.ndarray.ptp` + + """ + return _statistics._ndarray_ptp(self, axis, out, keepdims) + + cpdef _ndarray_base clip(self, min=None, max=None, out=None): + """Returns an array with values limited to [min, max]. + + .. seealso:: + :func:`cupy.clip` for full documentation, + :meth:`numpy.ndarray.clip` + + """ + return _math._ndarray_clip(self, min, max, out) + + cpdef _ndarray_base round(self, decimals=0, out=None): + """Returns an array with values rounded to the given number of decimals. + + ..
seealso:: + :func:`cupy.around` for full documentation, + :meth:`numpy.ndarray.round` + + """ # NOQA + return _round_ufunc(self, decimals, out=out) + + cpdef _ndarray_base trace( + self, offset=0, axis1=0, axis2=1, dtype=None, out=None): + """Returns the sum along diagonals of the array. + + .. seealso:: + :func:`cupy.trace` for full documentation, + :meth:`numpy.ndarray.trace` + + """ + d = self.diagonal(offset, axis1, axis2) + return d.sum(-1, dtype, out, False) + + cpdef _ndarray_base sum( + self, axis=None, dtype=None, out=None, keepdims=False): + """Returns the sum along a given axis. + + .. seealso:: + :func:`cupy.sum` for full documentation, + :meth:`numpy.ndarray.sum` + + """ + return _math._ndarray_sum(self, axis, dtype, out, keepdims) + + cpdef _ndarray_base cumsum(self, axis=None, dtype=None, out=None): + """Returns the cumulative sum of an array along a given axis. + + .. seealso:: + :func:`cupy.cumsum` for full documentation, + :meth:`numpy.ndarray.cumsum` + + """ + return _math._ndarray_cumsum(self, axis, dtype, out) + + cpdef _ndarray_base mean( + self, axis=None, dtype=None, out=None, keepdims=False): + """Returns the mean along a given axis. + + .. seealso:: + :func:`cupy.mean` for full documentation, + :meth:`numpy.ndarray.mean` + + """ + return _statistics._ndarray_mean(self, axis, dtype, out, keepdims) + + cpdef _ndarray_base var( + self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): + """Returns the variance along a given axis. + + .. seealso:: + :func:`cupy.var` for full documentation, + :meth:`numpy.ndarray.var` + + """ + return _statistics._ndarray_var( + self, axis, dtype, out, ddof, keepdims) + + cpdef _ndarray_base std( + self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): + """Returns the standard deviation along a given axis. + + .. seealso:: + :func:`cupy.std` for full documentation, + :meth:`numpy.ndarray.std` + + """ + return _statistics._ndarray_std(self, axis, dtype, out, ddof, keepdims) + + cpdef _ndarray_base prod( + self, axis=None, dtype=None, out=None, keepdims=None): + """Returns the product along a given axis. + + .. seealso:: + :func:`cupy.prod` for full documentation, + :meth:`numpy.ndarray.prod` + + """ + return _math._ndarray_prod(self, axis, dtype, out, keepdims) + + cpdef _ndarray_base cumprod(self, axis=None, dtype=None, out=None): + """Returns the cumulative product of an array along a given axis. + + .. 
seealso:: + :func:`cupy.cumprod` for full documentation, + :meth:`numpy.ndarray.cumprod` + + """ + return _math._ndarray_cumprod(self, axis, dtype, out) + + cpdef _ndarray_base _add_reduceat(self, indices, axis, dtype, out): + return _indexing._add_reduceat(self, indices, axis, dtype, out) + + cpdef _ndarray_base all(self, axis=None, out=None, keepdims=False): + # TODO(niboshi): Write docstring + return _logic._ndarray_all(self, axis, out, keepdims) + + cpdef _ndarray_base any(self, axis=None, out=None, keepdims=False): + # TODO(niboshi): Write docstring + return _logic._ndarray_any(self, axis, out, keepdims) + + # ------------------------------------------------------------------------- + # Arithmetic and comparison operations + # ------------------------------------------------------------------------- + # Comparison operators: + + def __richcmp__(object self, object other, int op): + if isinstance(other, ndarray): + if op == 0: + return _logic._ndarray_less(self, other) + if op == 1: + return _logic._ndarray_less_equal(self, other) + if op == 2: + return _logic._ndarray_equal(self, other) + if op == 3: + return _logic._ndarray_not_equal(self, other) + if op == 4: + return _logic._ndarray_greater(self, other) + if op == 5: + return _logic._ndarray_greater_equal(self, other) + elif not _should_use_rop(self, other): + if isinstance(other, numpy.ndarray) and other.ndim == 0: + other = other.item() # Workaround for numpy<1.13 + if op == 0: + return numpy.less(self, other) + if op == 1: + return numpy.less_equal(self, other) + if op == 2: + # cupy.ndarray does not support dtype=object, but + # allows comparison with None, Ellipsis, etc. + if type(other).__eq__ is object.__eq__: + # Implies `other` is neither (Python/NumPy) scalar nor + # ndarray. With object's default __eq__, it never + # equals an element of cupy.ndarray. + return cupy.zeros(self._shape, dtype=cupy.bool_) + return numpy.equal(self, other) + if op == 3: + if ( + type(other).__eq__ is object.__eq__ + and type(other).__ne__ is object.__ne__ + ): + # Similar to eq, but ne falls back to `not __eq__`. + return cupy.ones(self._shape, dtype=cupy.bool_) + return numpy.not_equal(self, other) + if op == 4: + return numpy.greater(self, other) + if op == 5: + return numpy.greater_equal(self, other) + return NotImplemented + + # Truth value of an array (bool): + + def __nonzero__(self): + if self.size == 0: + msg = ('The truth value of an empty array is ambiguous. Returning ' + 'False, but in future this will result in an error. Use ' + '`array.size > 0` to check that an array is not empty.') + warnings.warn(msg, DeprecationWarning) + return False + elif self.size == 1: + return bool(self.get()) + else: + msg = ('The truth value of an array with more than one element is ' + 'ambiguous. Use a.any() or a.all()') + raise ValueError(msg) + + # Unary operations: + + def __neg__(self): + return _math._negative(self) + + def __pos__(self): + if self.dtype == numpy.bool_: + msg = ("Applying '+' to a non-numerical array is ill-defined. 
" + 'Returning a copy, but in the future this will error.') + warnings.warn(msg, DeprecationWarning) + return self.copy() + return _math._positive(self) + + def __abs__(self): + return _math._absolute(self) + + def __invert__(self): + return _binary._invert(self) + + # Arithmetic: + + def __add__(x, y): + if isinstance(y, ndarray): + return _math._add(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.add(x, y) + + def __sub__(x, y): + if isinstance(y, ndarray): + return _math._subtract(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.subtract(x, y) + + def __mul__(x, y): + if isinstance(y, ndarray): + return _math._multiply(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.multiply(x, y) + + def __matmul__(x, y): + if isinstance(y, ndarray): + return _linalg.matmul(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.matmul(x, y) + + def __div__(x, y): + if isinstance(y, ndarray): + return _math._divide(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.divide(x, y) + + def __truediv__(x, y): + if isinstance(y, ndarray): + return _math._true_divide(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.true_divide(x, y) + + def __floordiv__(x, y): + if isinstance(y, ndarray): + return _math._floor_divide(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.floor_divide(x, y) + + def __mod__(x, y): + if isinstance(y, ndarray): + return _math._remainder(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.remainder(x, y) + + def __divmod__(x, y): + if isinstance(y, ndarray): + return divmod(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.divmod(x, y) + + def __pow__(x, y, modulo): + # Note that we ignore the modulo argument as well as NumPy. 
+ if isinstance(y, ndarray): + return _math._power(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.power(x, y) + + def __lshift__(x, y): + if isinstance(y, ndarray): + return _binary._left_shift(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.left_shift(x, y) + + def __rshift__(x, y): + if isinstance(y, ndarray): + return _binary._right_shift(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.right_shift(x, y) + + def __and__(x, y): + if isinstance(y, ndarray): + return _binary._bitwise_and(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.bitwise_and(x, y) + + def __or__(x, y): + if isinstance(y, ndarray): + return _binary._bitwise_or(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.bitwise_or(x, y) + + def __xor__(x, y): + if isinstance(y, ndarray): + return _binary._bitwise_xor(x, y) + elif _should_use_rop(x, y): + return NotImplemented + else: + return numpy.bitwise_xor(x, y) + + # Arithmetic, in-place: + + def __iadd__(self, other): + return _math._add(self, other, self) + + def __isub__(self, other): + return _math._subtract(self, other, self) + + def __imul__(self, other): + return _math._multiply(self, other, self) + + def __idiv__(self, other): + return _math._divide(self, other, self) + + def __itruediv__(self, other): + return _math._true_divide(self, other, self) + + def __ifloordiv__(self, other): + return _math._floor_divide(self, other, self) + + def __imod__(self, other): + return _math._remainder(self, other, self) + + def __ipow__(self, other): + return _math._power(self, other, self) + + def __ilshift__(self, other): + return _binary._left_shift(self, other, self) + + def __irshift__(self, other): + return _binary._right_shift(self, other, self) + + def __iand__(self, other): + return _binary._bitwise_and(self, other, self) + + def __ior__(self, other): + return _binary._bitwise_or(self, other, self) + + def __ixor__(self, other): + return _binary._bitwise_xor(self, other, self) + + cpdef _ndarray_base conj(self): + return _math._ndarray_conj(self) + + cpdef _ndarray_base conjugate(self): + return _math._ndarray_conj(self) + + @property + def real(self): + return _math._ndarray_real_getter(self) + + @real.setter + def real(self, value): + _math._ndarray_real_setter(self, value) + + @property + def imag(self): + return _math._ndarray_imag_getter(self) + + @imag.setter + def imag(self, value): + _math._ndarray_imag_setter(self, value) + + # ------------------------------------------------------------------------- + # Special methods + # ------------------------------------------------------------------------- + # For standard library functions: + + def __copy__(self): + return self.copy() + + def __deepcopy__(self, memo): + # Perform the copy on the device where this array resides. + prev_device = runtime.getDevice() + try: + runtime.setDevice(self.device.id) + return self.copy() + finally: + runtime.setDevice(prev_device) + + def __reduce__(self): + return array, (self.get(),) + + # Basic customization: + + # _ndarray_base does not define __new__ + + def __array__(self, dtype=None): + # TODO(imanishi): Support an environment variable or a global + # configure flag that allows implicit conversions to NumPy array. + # (See https://github.com/cupy/cupy/issues/589 for the detail.) + raise TypeError( + 'Implicit conversion to a NumPy array is not allowed. 
' + 'Please use `.get()` to construct a NumPy array explicitly.') + + @classmethod + def __class_getitem__(cls, tuple item): + from cupy.typing._generic_alias import GenericAlias + item1, item2 = item + return GenericAlias(cupy.ndarray, (item1, item2)) + + # TODO(okuta): Implement __array_wrap__ + + # Container customization: + + def __iter__(self): + if self._shape.size() == 0: + raise TypeError('iteration over a 0-d array') + return (self[i] for i in range(self._shape[0])) + + def __len__(self): + if self._shape.size() == 0: + raise TypeError('len() of unsized object') + return self._shape[0] + + def __getitem__(self, slices): + """x.__getitem__(y) <==> x[y] + + Supports both basic and advanced indexing. + + .. note:: + + Currently, it does not support ``slices`` that consist of more + than one boolean array + + .. note:: + + CuPy handles out-of-bounds indices differently from NumPy. + NumPy handles them by raising an error, but CuPy wraps around them. + + Example: + + >>> a = cupy.arange(3) + >>> a[[1, 3]] + array([1, 0]) + + """ + return _indexing._ndarray_getitem(self, slices) + + def __setitem__(self, slices, value): + """x.__setitem__(slices, y) <==> x[slices] = y + + Supports both basic and advanced indexing. + + .. note:: + + Currently, it does not support ``slices`` that consist of more + than one boolean array + + .. note:: + + CuPy handles out-of-bounds indices differently from NumPy when + using integer array indexing. + NumPy handles them by raising an error, but CuPy wraps around them. + + >>> import cupy + >>> x = cupy.arange(3) + >>> x[[1, 3]] = 10 + >>> x + array([10, 10, 2]) + + .. note:: + + The behavior differs from NumPy when integer arrays in ``slices`` + reference the same location multiple times. + In that case, the value that is actually stored is undefined. + + >>> import cupy + >>> a = cupy.zeros((2,)) + >>> i = cupy.arange(10000) % 2 + >>> v = cupy.arange(10000).astype(cupy.float_) + >>> a[i] = v + >>> a # doctest: +SKIP + array([9150., 9151.]) + + On the other hand, NumPy stores the value corresponding to the + last index among the indices referencing duplicate locations. + + >>> import numpy + >>> a_cpu = numpy.zeros((2,)) + >>> i_cpu = numpy.arange(10000) % 2 + >>> v_cpu = numpy.arange(10000).astype(numpy.float_) + >>> a_cpu[i_cpu] = v_cpu + >>> a_cpu + array([9998., 9999.]) + + """ + if _util.ENABLE_SLICE_COPY and ( + type(slices) is slice + and slices == slice(None, None, None) + and isinstance(value, numpy.ndarray) + ): + if (self.dtype == value.dtype + and self.shape == value.shape + and (self._f_contiguous or self._c_contiguous)): + order = 'F' if self._f_contiguous else 'C' + tmp = value.ravel(order) + ptr = tmp.ctypes.data + stream_ptr = stream_module.get_current_stream_ptr() + if stream_ptr == 0: + self.data.copy_from_host(ptr, self.nbytes) + else: + self.data.copy_from_host_async(ptr, self.nbytes) + else: + raise ValueError( + 'copying a numpy.ndarray to a cupy.ndarray by empty slice ' + 'assignment must ensure arrays have same shape and dtype') + else: + _indexing._ndarray_setitem(self, slices, value) + + def scatter_add(self, slices, value): + """Adds given values to specified elements of an array. + + .. seealso:: + :func:`cupyx.scatter_add` for full documentation. + + """ + warnings.warn( + '`ndarray.scatter_add` is deprecated. 
' + 'Please use `cupy.add.at` instead.', + DeprecationWarning) + self._scatter_op(slices, value, 'add') + + def scatter_max(self, slices, value): + """Stores a maximum value of elements specified by indices to an array. + + .. seealso:: + :func:`cupyx.scatter_max` for full documentation. + + """ + warnings.warn( + '`ndarray.scatter_max` is deprecated. ' + 'Please use `cupy.maximum.at` instead.', + DeprecationWarning) + self._scatter_op(slices, value, 'max') + + def scatter_min(self, slices, value): + """Stores a minimum value of elements specified by indices to an array. + + .. seealso:: + :func:`cupyx.scatter_min` for full documentation. + + """ + warnings.warn( + '`ndarray.scatter_min` is deprecated. ' + 'Please use `cupy.minimum.at` instead.', + DeprecationWarning) + self._scatter_op(slices, value, 'min') + + def _scatter_op(self, slices, value, op): + _indexing._scatter_op(self, slices, value, op) + + # TODO(okuta): Implement __getslice__ + # TODO(okuta): Implement __setslice__ + # TODO(okuta): Implement __contains__ + + # numpy/ufunc compat + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + + """Apply unary or binary ufunc to this array. + + If binary, only allow if second argument is another cupy ndarray or + a number, i.e., raise ValueError instead of silently converting a + numpy array. + """ + import cupy # top-level ufuncs + import cupyx.scipy.special # special ufuncs + inout = inputs + if 'out' in kwargs: + # need to unfold tuple argument in kwargs + # TODO(ecastill) GUFuncs support more than one output + out = kwargs['out'] + if len(out) != 1: + raise ValueError('The \'out\' parameter must have exactly one ' + 'array value') + inout += out + kwargs['out'] = out[0] + + if method in ( + '__call__', 'outer', 'at', 'reduce', 'accumulate', 'reduceat' + ): + name = ufunc.__name__ + try: + func = getattr(cupy, name, None) or getattr( + cupyx.scipy.special, name + ) + if method != '__call__': + func = getattr(func, method) + except AttributeError: + return NotImplemented + for x in inout: + # numpy.ndarray is handled and then TypeError is raised due to + # implicit host-to-device conversion. + # Except for numpy.ndarray, types should be supported by + # `_kernel._preprocess_args`. 
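+ # Illustrative acceptance rule for the check below: an operand + # participates if it duck-types as a device array by exposing + # __cuda_array_interface__ (e.g. a Numba CUDA device array) or + # __cupy_get_ndarray__; known scalar types and numpy.ndarray pass + # through here, and every other type makes this method return + # NotImplemented.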
+ check = (hasattr(x, '__cuda_array_interface__') + or hasattr(x, '__cupy_get_ndarray__')) + if runtime._is_hip_environment and isinstance(x, ndarray): + check = True + if (not check + and not type(x) in _scalar.scalar_type_set + and not isinstance(x, numpy.ndarray)): + return NotImplemented + if name in [ + 'greater', 'greater_equal', 'less', 'less_equal', + 'equal', 'not_equal']: + # workaround for numpy/numpy#12142 + inputs = tuple([ + x.item() + if isinstance(x, numpy.ndarray) and x.ndim == 0 + else x + for x in inputs + ]) + return func(*inputs, **kwargs) + else: + return NotImplemented + + def __array_function__(self, func, types, args, kwargs): + try: + module = functools.reduce( + getattr, func.__module__.split('.')[1:], cupy) + cupy_func = getattr(module, func.__name__) + except AttributeError: + return NotImplemented + if cupy_func is func: + # avoid NumPy func + return NotImplemented + for t in types: + for handled_type in _HANDLED_TYPES: + if issubclass(t, handled_type): + break + else: + return NotImplemented + return cupy_func(*args, **kwargs) + + # Conversion: + + def __int__(self): + return int(self.get()) + + def __float__(self): + return float(self.get()) + + def __complex__(self): + return complex(self.get()) + + def __oct__(self): + return oct(self.get()) + + def __hex__(self): + return hex(self.get()) + + def __bytes__(self): + return bytes(self.get()) + + # String representations: + + def __repr__(self): + return repr(self.get()) + + def __str__(self): + return str(self.get()) + + def __format__(self, format_spec): + return format(self.get(), format_spec) + + # ------------------------------------------------------------------------- + # Methods outside of the ndarray main documentation + # ------------------------------------------------------------------------- + def dot(self, _ndarray_base b, _ndarray_base out=None): + """Returns the dot product with given array. + + .. seealso:: + :func:`cupy.dot` for full documentation, + :meth:`numpy.ndarray.dot` + + """ + return _linalg.dot(self, b, out) + + # ------------------------------------------------------------------------- + # Cupy specific attributes and methods + # ------------------------------------------------------------------------- + @property + def device(self): + """CUDA device on which this array resides.""" + return self.data.device + + cpdef get(self, stream=None, order='C', out=None): + """Returns a copy of the array on host memory. + + Args: + stream (cupy.cuda.Stream): CUDA stream object. If it is given, the + copy runs asynchronously. Otherwise, the copy is synchronous. + The default uses CUDA stream object of the current context. + order ({'C', 'F', 'A'}): The desired memory layout of the host + array. When ``order`` is 'A', it uses 'F' if the array is + fortran-contiguous and 'C' otherwise. The ``order`` will be + ignored if ``out`` is specified. + out (numpy.ndarray): Output array. In order to enable asynchronous + copy, the underlying memory should be pinned memory. + + Returns: + numpy.ndarray: Copy of the array on host memory. + + """ + if out is not None: + if not isinstance(out, numpy.ndarray): + raise TypeError('Only numpy.ndarray can be obtained from ' + 'cupy.ndarray') + if self.dtype != out.dtype: + raise TypeError( + '{} array cannot be obtained from {} array'.format( + out.dtype, self.dtype)) + if self.shape != out.shape: + raise ValueError( + 'Shape mismatch. 
Expected shape: {}, ' + 'actual shape: {}'.format(self.shape, out.shape)) + if not (out.flags.c_contiguous and self._c_contiguous or + out.flags.f_contiguous and self._f_contiguous): + prev_device = runtime.getDevice() + try: + runtime.setDevice(self.device.id) + if out.flags.c_contiguous: + a_gpu = _internal_ascontiguousarray(self) + elif out.flags.f_contiguous: + a_gpu = _internal_asfortranarray(self) + else: + raise RuntimeError( + '`out` cannot be specified when copying to ' + 'non-contiguous ndarray') + finally: + runtime.setDevice(prev_device) + else: + a_gpu = self + a_cpu = out + else: + if self.size == 0: + return numpy.ndarray(self._shape, dtype=self.dtype) + + order = order.upper() + if order == 'A': + if self._f_contiguous: + order = 'F' + else: + order = 'C' + if not (order == 'C' and self._c_contiguous or + order == 'F' and self._f_contiguous): + prev_device = runtime.getDevice() + try: + runtime.setDevice(self.device.id) + if order == 'C': + a_gpu = _internal_ascontiguousarray(self) + elif order == 'F': + a_gpu = _internal_asfortranarray(self) + else: + raise ValueError('unsupported order: {}'.format(order)) + finally: + runtime.setDevice(prev_device) + else: + a_gpu = self + a_cpu = numpy.empty(self._shape, dtype=self.dtype, order=order) + + syncdetect._declare_synchronize() + ptr = a_cpu.ctypes.data + prev_device = runtime.getDevice() + try: + runtime.setDevice(self.device.id) + if stream is not None: + a_gpu.data.copy_to_host_async(ptr, a_gpu.nbytes, stream) + else: + stream_ptr = stream_module.get_current_stream_ptr() + if stream_ptr == 0: + a_gpu.data.copy_to_host(ptr, a_gpu.nbytes) + else: + a_gpu.data.copy_to_host_async(ptr, a_gpu.nbytes) + finally: + runtime.setDevice(prev_device) + return a_cpu + + cpdef set(self, arr, stream=None): + """Copies an array on the host memory to :class:`cupy.ndarray`. + + Args: + arr (numpy.ndarray): The source array on the host memory. + stream (cupy.cuda.Stream): CUDA stream object. If it is given, the + copy runs asynchronously. Otherwise, the copy is synchronous. + The default uses CUDA stream object of the current context. + + """ + if not isinstance(arr, numpy.ndarray): + raise TypeError('Only numpy.ndarray can be set to cupy.ndarray') + if self.dtype != arr.dtype: + raise TypeError('{} array cannot be set to {} array'.format( + arr.dtype, self.dtype)) + if self.shape != arr.shape: + raise ValueError( + 'Shape mismatch. Old shape: {}, new shape: {}'.format( + self.shape, arr.shape)) + if self._c_contiguous: + arr = numpy.ascontiguousarray(arr) + elif self._f_contiguous: + arr = numpy.asfortranarray(arr) + else: + raise RuntimeError('Cannot set to non-contiguous array') + + ptr = arr.ctypes.data + prev_device = runtime.getDevice() + try: + runtime.setDevice(self.device.id) + if stream is not None: + self.data.copy_from_host_async(ptr, self.nbytes, stream) + else: + stream_ptr = stream_module.get_current_stream_ptr() + if stream_ptr == 0: + self.data.copy_from_host(ptr, self.nbytes) + else: + self.data.copy_from_host_async(ptr, self.nbytes) + finally: + runtime.setDevice(prev_device) + + cpdef _ndarray_base reduced_view(self, dtype=None): + """Returns a view of the array with minimum number of dimensions. + + Args: + dtype: (Deprecated) Data type specifier. + If it is given, then the memory + sequence is reinterpreted as the new type. + + Returns: + cupy.ndarray: A view of the array with reduced dimensions. 
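+ + .. admonition:: Example + + A minimal sketch of the C-contiguous fast path implemented below: + + >>> a = cupy.ones((2, 3)) + >>> a.reduced_view().shape + (6,)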
+ + """ + cdef shape_t shape + cdef strides_t strides + cdef Py_ssize_t ndim + cdef _ndarray_base view + if dtype is not None: + warnings.warn( + 'calling reduced_view with dtype is deprecated', + DeprecationWarning) + return self.reduced_view().view(dtype) + + ndim = self._shape.size() + if ndim <= 1: + return self + if self._c_contiguous: + view = self.view() + view._shape.assign(1, self.size) + view._strides.assign(1, self.dtype.itemsize) + view._update_f_contiguity() + return view + + internal.get_reduced_dims( + self._shape, self._strides, self.dtype.itemsize, shape, strides) + if ndim == shape.size(): + return self + + # TODO(niboshi): Confirm update_x_contiguity flags + return self._view(type(self), shape, strides, False, True, self) + + cpdef _update_c_contiguity(self): + if self.size == 0: + self._c_contiguous = True + return + self._c_contiguous = internal.get_c_contiguity( + self._shape, self._strides, self.dtype.itemsize) + + cpdef _update_f_contiguity(self): + cdef Py_ssize_t i, count + cdef shape_t rev_shape + cdef strides_t rev_strides + if self.size == 0: + self._f_contiguous = True + return + if self._c_contiguous: + count = 0 + for i in self._shape: + if i == 1: + count += 1 + self._f_contiguous = (self._shape.size()) - count <= 1 + return + rev_shape.assign(self._shape.rbegin(), self._shape.rend()) + rev_strides.assign(self._strides.rbegin(), self._strides.rend()) + self._f_contiguous = internal.get_c_contiguity( + rev_shape, rev_strides, self.dtype.itemsize) + + cpdef _update_contiguity(self): + self._update_c_contiguity() + self._update_f_contiguity() + + cpdef _set_shape_and_strides(self, const shape_t& shape, + const strides_t& strides, + bint update_c_contiguity, + bint update_f_contiguity): + if shape.size() != strides.size(): + raise ValueError('len(shape) != len(strides)') + if shape.size() > _carray.MAX_NDIM: + msg = 'maximum supported dimension for an ndarray is ' + msg += f'{_carray.MAX_NDIM}, found {shape.size()}' + raise ValueError(msg) + self._shape = shape + self._strides = strides + self.size = internal.prod(shape) + if update_c_contiguity: + self._update_c_contiguity() + if update_f_contiguity: + self._update_f_contiguity() + + cdef _ndarray_base _view(self, subtype, const shape_t& shape, + const strides_t& strides, + bint update_c_contiguity, + bint update_f_contiguity, obj): + cdef _ndarray_base v + # Use `_no_init=True` to skip recomputation of contiguity. Now + # calling `__array_finalize__` is the responsibility of this method. + v = ndarray.__new__(subtype, _obj=obj, _no_init=True) + v.data = self.data + v.base = self.base if self.base is not None else self + v.dtype = self.dtype + v._c_contiguous = self._c_contiguous + v._f_contiguous = self._f_contiguous + v._index_32_bits = self._index_32_bits + v._set_shape_and_strides( + shape, strides, update_c_contiguity, update_f_contiguity) + if subtype is not ndarray: + v.__array_finalize__(self) + return v + + cpdef _set_contiguous_strides( + self, Py_ssize_t itemsize, bint is_c_contiguous): + self.size = internal.get_contiguous_strides_inplace( + self._shape, self._strides, itemsize, is_c_contiguous, True) + if is_c_contiguous: + self._c_contiguous = True + self._update_f_contiguity() + else: + self._f_contiguous = True + self._update_c_contiguity() + + cdef function.CPointer get_pointer(self): + return _CArray_from_ndarray(self) + + cpdef object toDlpack(self): + """Zero-copy conversion to a DLPack tensor. + + DLPack is an open in-memory tensor structure proposed in the + repository: `dmlc/dlpack <https://github.com/dmlc/dlpack>`_. 
+ + This function returns a :class:`PyCapsule` object which contains a + pointer to a DLPack tensor converted from this ndarray. This + function does not copy the data to the output DLPack tensor; + instead, it shares a pointer to the same memory region as this + ndarray. + + Returns: + dltensor (:class:`PyCapsule`): Output DLPack tensor which is + encapsulated in a :class:`PyCapsule` object. + + .. seealso:: + + :meth:`~cupy.fromDlpack` is a method for zero-copy conversion from + a DLPack tensor (which is encapsulated in a :class:`PyCapsule` + object) to a :class:`ndarray`. + + .. warning:: + + As of the DLPack v0.3 specification, it is (implicitly) assumed + that the user is responsible for ensuring that the Producer and the + Consumer are operating on the same stream. This requirement might + be relaxed/changed in a future DLPack version. + + .. admonition:: Example + + >>> import cupy + >>> array1 = cupy.array([0, 1, 2], dtype=cupy.float32) + >>> dltensor = array1.toDlpack() + >>> array2 = cupy.fromDlpack(dltensor) + >>> cupy.testing.assert_array_equal(array1, array2) + + """ + return dlpack.toDlpack(self) + + +cdef inline _carray.CArray _CArray_from_ndarray(_ndarray_base arr): + # Creates CArray from ndarray. + # Note that this function cannot be defined in _carray.pxd because that + # would cause cyclic cimport dependencies. + cdef _carray.CArray carr = _carray.CArray.__new__(_carray.CArray) + carr.init(arr.data.ptr, arr.size, arr._shape, arr._strides) + return carr + + +_HANDLED_TYPES = (ndarray, numpy.ndarray) + + +# ============================================================================= +# compile_with_cache +# ============================================================================= +# TODO(niboshi): Move it out of core.pyx + +cdef bint _is_hip = runtime._is_hip_environment +cdef int _cuda_runtime_version = -1 +cdef str _cuda_path = '' # '' for uninitialized, None for non-existing + +cdef list cupy_header_list = [ + 'cupy/complex.cuh', + 'cupy/carray.cuh', + 'cupy/atomics.cuh', + 'cupy/math_constants.h', +] +if _is_hip: + cupy_header_list.append('cupy/hip_workaround.cuh') + +# expose to Python for unit testing +_cupy_header_list = cupy_header_list + +cdef str _cupy_header = ''.join( + ['#include <%s>\n' % i for i in cupy_header_list]) + +# This is the list of indirectly included headers. +# These header files are taken into account in the compiler cache hash key. 
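+# Note: both the directly included headers above and the indirect ones + # listed below feed that cache key, so editing any bundled header + # invalidates previously cached kernels.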
+cdef list _cupy_extra_header_list = [ + 'cupy/complex/complex.h', + 'cupy/complex/math_private.h', + 'cupy/complex/complex_inl.h', + 'cupy/complex/arithmetic.h', + 'cupy/complex/cproj.h', + 'cupy/complex/cexp.h', + 'cupy/complex/cexpf.h', + 'cupy/complex/clog.h', + 'cupy/complex/clogf.h', + 'cupy/complex/cpow.h', + 'cupy/complex/ccosh.h', + 'cupy/complex/ccoshf.h', + 'cupy/complex/csinh.h', + 'cupy/complex/csinhf.h', + 'cupy/complex/ctanh.h', + 'cupy/complex/ctanhf.h', + 'cupy/complex/csqrt.h', + 'cupy/complex/csqrtf.h', + 'cupy/complex/catrig.h', + 'cupy/complex/catrigf.h', + 'cupy/swap.cuh', + 'cupy/tuple/type_traits.h', + 'cupy/tuple/tuple.h', + 'cupy/tuple.cuh', +] + +cdef str _header_path_cache = None +cdef str _header_source = None +cdef dict _header_source_map = {} + + +cpdef str _get_header_dir_path(): + global _header_path_cache + if _header_path_cache is None: + # Cython cannot use __file__ in global scope + _header_path_cache = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'include')) + return _header_path_cache + + +cpdef str _get_header_source(): + global _header_source + global _header_source_map + cdef str header_path, base_path, file_path, header + cdef list source + + if _header_source is None or not _header_source_map: + source = [] + base_path = _get_header_dir_path() + for file_path in _cupy_extra_header_list + cupy_header_list: + header_path = os.path.join(base_path, file_path) + with open(header_path) as header_file: + header = header_file.read() + source.append(header) + _header_source_map[file_path.encode()] = header.encode() + _header_source = '\n'.join(source) + return _header_source + + +cpdef dict _get_header_source_map(): + global _header_source_map + if not _header_source_map: + _get_header_source() + return _header_source_map + + +# added at the module level for precompiling the regex +_cucomplex_include_tokens = ['', '#', 'include', '<', r'cuComplex\.h', '>'] +_cucomplex_include_pattern = re.compile(r'\s*'.join(_cucomplex_include_tokens)) + + +cdef inline str _translate_cucomplex_to_thrust(str source): + lines = [] + for line in source.splitlines(keepends=True): + if _cucomplex_include_pattern.match(line): + lines.append('#include <cupy/cuComplex_bridge.h> ' + '// translate_cucomplex\n') + else: + lines.append(line) + return ''.join(lines) + + +cpdef function.Module compile_with_cache( + str source, tuple options=(), arch=None, cachd_dir=None, + prepend_cupy_headers=True, backend='nvrtc', translate_cucomplex=False, + enable_cooperative_groups=False, name_expressions=None, + log_stream=None, bint jitify=False): + if translate_cucomplex: + source = _translate_cucomplex_to_thrust(source) + cupy_header_list.append('cupy/cuComplex_bridge.h') + prepend_cupy_headers = True + + if prepend_cupy_headers: + source = _cupy_header + source + extra_source = _get_header_source() + options += ('-I%s' % _get_header_dir_path(),) + + # The variable _cuda_runtime_version is declared in cupy/_core/core.pyx, + # but it might not have been set appropriately before coming here. 
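+ # Lazy-initialization sketch: -1 marks the runtime version as not yet + # probed, while for _cuda_path '' means "not probed yet" and None means + # "probed but not found" (see the declarations near the top of this + # section).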
+ global _cuda_runtime_version + if _cuda_runtime_version < 0: + _cuda_runtime_version = runtime.runtimeGetVersion() + + global _cuda_path + if _cuda_path == '': + if not _is_hip: + _cuda_path = cuda.get_cuda_path() + else: + _cuda_path = cuda.get_rocm_path() + + if not _is_hip: + if 10020 <= _cuda_runtime_version < 10030: + bundled_include = 'cuda-10.2' + elif 11000 <= _cuda_runtime_version < 11010: + bundled_include = 'cuda-11.0' + elif 11010 <= _cuda_runtime_version < 11020: + bundled_include = 'cuda-11.1' + elif 11020 <= _cuda_runtime_version < 12000: + # CUDA Enhanced Compatibility + bundled_include = 'cuda-11' + elif 12000 <= _cuda_runtime_version < 13000: + # CUDA Enhanced Compatibility + bundled_include = 'cuda-12' + else: + # CUDA versions not yet supported. + bundled_include = None + + if bundled_include is None and _cuda_path is None: + raise RuntimeError( + 'Failed to auto-detect CUDA root directory. ' + 'Please specify `CUDA_PATH` environment variable if you ' + 'are using CUDA versions not yet supported by CuPy.') + + if bundled_include is not None: + options += ('-I' + os.path.join( + _get_header_dir_path(), 'cupy', '_cuda', bundled_include),) + elif _is_hip: + if _cuda_path is None: + raise RuntimeError( + 'Failed to auto-detect ROCm root directory. ' + 'Please specify `ROCM_HOME` environment variable.') + + if _cuda_path is not None: + options += ('-I' + os.path.join(_cuda_path, 'include'),) + + return cuda.compiler._compile_module_with_cache( + source, options, arch, cachd_dir, extra_source, backend, + enable_cooperative_groups=enable_cooperative_groups, + name_expressions=name_expressions, log_stream=log_stream, + jitify=jitify) + + +# ============================================================================= +# Routines +# ============================================================================= + +cdef str _id = 'out0 = in0' + +cdef fill_kernel = ElementwiseKernel('T x', 'T y', 'y = x', 'cupy_fill') + +cdef str _divmod_float = ''' + out0_type a = _floor_divide(in0, in1); + out0 = a; + out1 = in0 - a * in1''' + + +divmod = create_ufunc( + 'cupy_divmod', + ('bb->bb', 'BB->BB', 'hh->hh', 'HH->HH', 'ii->ii', 'II->II', 'll->ll', + 'LL->LL', 'qq->qq', 'QQ->QQ', + ('ee->ee', _divmod_float), + ('ff->ff', _divmod_float), + ('dd->dd', _divmod_float)), + ''' + if (in1 == 0) { + out0 = 0; + out1 = 0; + } else { + out0_type a = _floor_divide(in0, in1); + out0 = a; + out1 = in0 - a * in1; + }''') + + +cdef _round_preamble = ''' +#ifdef __HIP_DEVICE_COMPILE__ +#define round_float llrintf +#else +#define round_float __float2ll_rn +#endif + +template<typename T> __device__ T pow10(long long n){ + T x = 1, a = 10; + while (n) { + if (n & 1) x *= a; + a *= a; + n >>= 1; + } + return x; +}; +''' + + +cdef _round_float = ''' +if (in1 == 0) { + out0 = rint(in0); +} else { + double x; + x = pow10<double>(abs(in1)); // TODO(okuta): Move before loop + out0 = in1 < 0 ? rint(in0 / x) * x : rint(in0 * x) / x; +}''' + +cdef _round_complex = ''' +if (in1 == 0) { + out0 = in0_type(rint(in0.real()), rint(in0.imag())); +} else { + double x = pow10<double>(abs(in1)); // TODO(okuta): Move before loop + if (in1 < 0) { + out0 = in0_type(rint(in0.real() / x) * x, + rint(in0.imag() / x) * x); + } else { + out0 = in0_type(rint(in0.real() * x) / x, + rint(in0.imag() * x) / x); + } +}''' + + +# There is a known incompatibility with NumPy (as of 1.16.4) such as +# `numpy.around(2**63, -1) == cupy.around(2**63, -1)` gives `False`. +# +# NumPy seems to round integral values via double. 
As double has + # only 53 bit precision, last few bits of (u)int64 value may be lost. + # As a consequence, `numpy.around(2**63, -1)` does NOT round up the + # last digit (9223372036854775808 instead of ...810). + # + # The following code fixes the problem, so `cupy.around(2**63, -1)` + # gives `...810`, which (may be correct but) is incompatible with NumPy. +_round_ufunc = create_ufunc( + 'cupy_round', + ('?q->e', + 'bq->b', 'Bq->B', 'hq->h', 'Hq->H', 'iq->i', 'Iq->I', 'lq->l', 'Lq->L', + 'qq->q', 'Qq->Q', + ('eq->e', _round_float), + ('fq->f', _round_float), + ('dq->d', _round_float), + ('Fq->F', _round_complex), + ('Dq->D', _round_complex)), + ''' + if (in1 >= 0) { + out0 = in0; + } else { + // TODO(okuta): Move before loop + long long x = pow10<long long>(-in1 - 1); + + // TODO(okuta): Check Numpy + // `cupy.around(-123456789, -4)` works as follows: + // (1) scale by `x` above: -123456.789 + // (2) split at the last 2 digits: -123400 + (-5.6789 * 10) + // (3) round the latter by `rint()`: -123400 + (-6.0 * 10) + // (4) unscale by `x` above: -123460000 + long long q = in0 / x / 100; + int r = in0 - q*x*100; + out0 = (q*100 + round_float(r/(x*10.0f))*10) * x; + }''', preamble=_round_preamble) + + +# ----------------------------------------------------------------------------- +# Array creation routines +# ----------------------------------------------------------------------------- + +cpdef _ndarray_base array(obj, dtype=None, bint copy=True, order='K', + bint subok=False, Py_ssize_t ndmin=0): + # TODO(beam2d): Support subok options + if subok: + raise NotImplementedError + if order is None: + order = 'K' + + if isinstance(obj, ndarray): + return _array_from_cupy_ndarray(obj, dtype, copy, order, ndmin) + + if hasattr(obj, '__cuda_array_interface__'): + return _array_from_cuda_array_interface( + obj, dtype, copy, order, subok, ndmin) + if hasattr(obj, '__cupy_get_ndarray__'): + return _array_from_cupy_ndarray( + obj.__cupy_get_ndarray__(), dtype, copy, order, ndmin) + + concat_shape, concat_type, concat_dtype = ( + _array_info_from_nested_sequence(obj)) + if concat_shape is not None: + return _array_from_nested_sequence( + obj, dtype, order, ndmin, concat_shape, concat_type, concat_dtype) + + return _array_default(obj, dtype, order, ndmin) + + +cdef _ndarray_base _array_from_cupy_ndarray( + obj, dtype, bint copy, order, Py_ssize_t ndmin): + cdef Py_ssize_t ndim + cdef _ndarray_base a, src + + src = obj + + if dtype is None: + dtype = src.dtype + + if src.data.device_id == device.get_device_id(): + a = src.astype(dtype, order=order, copy=copy) + else: + a = src.copy(order=order).astype(dtype, copy=False) + + ndim = a._shape.size() + if ndmin > ndim: + if a is obj: + # When `copy` is False, `a` is the same as `obj`. 
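+ # Take a view so that prepending the ndmin axes below does not + # mutate the caller's array in place.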
+ a = a.view() + a.shape = (1,) * (ndmin - ndim) + a.shape + + return a + + +cdef _ndarray_base _array_from_cuda_array_interface( + obj, dtype, bint copy, order, bint subok, Py_ssize_t ndmin): + return array( + _convert_object_with_cuda_array_interface(obj), + dtype, copy, order, subok, ndmin) + + +cdef _ndarray_base _array_from_nested_sequence( + obj, dtype, order, Py_ssize_t ndmin, concat_shape, concat_type, + concat_dtype): + cdef Py_ssize_t ndim + + # resulting array is C order unless 'F' is explicitly specified + # (i.e., it ignores order of element arrays in the sequence) + order = ( + 'F' + if order is not None and len(order) >= 1 and order[0] in 'Ff' + else 'C') + + ndim = len(concat_shape) + if ndmin > ndim: + concat_shape = (1,) * (ndmin - ndim) + concat_shape + + if dtype is None: + dtype = concat_dtype.newbyteorder('<') + + if concat_type is numpy.ndarray: + return _array_from_nested_numpy_sequence( + obj, concat_dtype, dtype, concat_shape, order, ndmin) + elif concat_type is ndarray: # TODO(takagi) Consider subclasses + return _array_from_nested_cupy_sequence( + obj, dtype, concat_shape, order) + else: + assert False + + +cdef _ndarray_base _array_from_nested_numpy_sequence( + arrays, src_dtype, dst_dtype, const shape_t& shape, order, + Py_ssize_t ndmin): + a_dtype = get_dtype(dst_dtype) # convert to numpy.dtype + if a_dtype.char not in '?bhilqBHILQefdFD': + raise ValueError('Unsupported dtype %s' % a_dtype) + cdef _ndarray_base a # allocate it after pinned memory is secured + cdef size_t itemcount = internal.prod(shape) + cdef size_t nbytes = itemcount * a_dtype.itemsize + + stream = stream_module.get_current_stream() + # Note: even if arrays are already backed by pinned memory, we still need + # to allocate an extra buffer and copy from it to avoid potential data + # race, see the discussion here: + # https://github.com/cupy/cupy/pull/5155#discussion_r621808782 + cdef pinned_memory.PinnedMemoryPointer mem = ( + _alloc_async_transfer_buffer(nbytes)) + if mem is not None: + # write concatenated arrays to the pinned memory directly + src_cpu = ( + numpy.frombuffer(mem, a_dtype, itemcount) + .reshape(shape, order=order)) + _concatenate_numpy_array( + [numpy.expand_dims(e, 0) for e in arrays], + 0, + get_dtype(src_dtype), + a_dtype, + src_cpu) + a = ndarray(shape, dtype=a_dtype, order=order) + a.data.copy_from_host_async(mem.ptr, nbytes) + pinned_memory._add_to_watch_list(stream.record(), mem) + else: + # fallback to numpy array and send it to GPU + # Note: a_cpu.ndim is always >= 1 + a_cpu = numpy.array(arrays, dtype=a_dtype, copy=False, order=order, + ndmin=ndmin) + a = ndarray(shape, dtype=a_dtype, order=order) + a.data.copy_from_host(a_cpu.ctypes.data, nbytes) + + return a + + +cdef _ndarray_base _array_from_nested_cupy_sequence(obj, dtype, shape, order): + lst = _flatten_list(obj) + + # convert each scalar (0-dim) ndarray to 1-dim + lst = [cupy.expand_dims(x, 0) if x.ndim == 0 else x for x in lst] + + a = _manipulation.concatenate_method(lst, 0) + a = a.reshape(shape) + a = a.astype(dtype, order=order, copy=False) + return a + + +cdef _ndarray_base _array_default(obj, dtype, order, Py_ssize_t ndmin): + if order is not None and len(order) >= 1 and order[0] in 'KAka': + if isinstance(obj, numpy.ndarray) and obj.flags.fnc: + order = 'F' + else: + order = 'C' + a_cpu = numpy.array(obj, dtype=dtype, copy=False, order=order, + ndmin=ndmin) + if a_cpu.dtype.char not in '?bhilqBHILQefdFD': + raise ValueError('Unsupported dtype %s' % a_cpu.dtype) + a_cpu = 
a_cpu.astype(a_cpu.dtype.newbyteorder('<'), copy=False) + a_dtype = a_cpu.dtype + cdef shape_t a_shape = a_cpu.shape + cdef _ndarray_base a = ndarray(a_shape, dtype=a_dtype, order=order) + if a_cpu.ndim == 0: + a.fill(a_cpu) + return a + cdef Py_ssize_t nbytes = a.nbytes + + stream = stream_module.get_current_stream() + # Note: even if obj is already backed by pinned memory, we still need to + # allocate an extra buffer and copy from it to avoid potential data race, + # see the discussion here: + # https://github.com/cupy/cupy/pull/5155#discussion_r621808782 + cdef pinned_memory.PinnedMemoryPointer mem = ( + _alloc_async_transfer_buffer(nbytes)) + if mem is not None: + src_cpu = numpy.frombuffer(mem, a_dtype, a_cpu.size) + src_cpu[:] = a_cpu.ravel(order) + a.data.copy_from_host_async(mem.ptr, nbytes) + pinned_memory._add_to_watch_list(stream.record(), mem) + else: + a.data.copy_from_host(a_cpu.ctypes.data, nbytes) + + return a + + +cdef tuple _array_info_from_nested_sequence(obj): + # Returns a tuple containing information if we can simply concatenate the + # input to make a CuPy array (i.e., a (nested) sequence that only contains + # NumPy/CuPy arrays with the same shape and dtype). `(None, None, None)` + # means we do not concatenate the input. + # 1. A concatenated shape + # 2. The type of the arrays to concatenate (numpy.ndarray or cupy.ndarray) + # 3. The dtype of the arrays to concatenate + if isinstance(obj, (list, tuple)): + return _compute_concat_info_impl(obj) + else: + return None, None, None + + +cdef tuple _compute_concat_info_impl(obj): + cdef Py_ssize_t dim + + if isinstance(obj, (numpy.ndarray, ndarray)): + return obj.shape, type(obj), obj.dtype + + if hasattr(obj, '__cupy_get_ndarray__'): + return obj.shape, ndarray, obj.dtype + + if isinstance(obj, (list, tuple)): + dim = len(obj) + if dim == 0: + return None, None, None + + concat_shape, concat_type, concat_dtype = ( + _compute_concat_info_impl(obj[0])) + if concat_shape is None: + return None, None, None + + for elem in obj[1:]: + concat_shape1, concat_type1, concat_dtype1 = ( + _compute_concat_info_impl(elem)) + if concat_shape1 is None: + return None, None, None + + if concat_shape != concat_shape1: + return None, None, None + if concat_type is not concat_type1: + return None, None, None + if concat_dtype != concat_dtype1: + concat_dtype = numpy.promote_types(concat_dtype, concat_dtype1) + + return (dim,) + concat_shape, concat_type, concat_dtype + + return None, None, None + + +cdef list _flatten_list(object obj): + ret = [] + if isinstance(obj, (list, tuple)): + for elem in obj: + ret += _flatten_list(elem) + return ret + return [obj] + + +cdef bint _numpy_concatenate_has_out_argument = ( + numpy.lib.NumpyVersion(numpy.__version__) >= '1.14.0') + + +cdef inline _concatenate_numpy_array(arrays, axis, src_dtype, dst_dtype, out): + # type(*_dtype) must be numpy.dtype + + if (_numpy_concatenate_has_out_argument + and src_dtype.kind == dst_dtype.kind): + # concatenate only accepts same_kind casting + numpy.concatenate(arrays, axis, out) + else: + out[:] = numpy.concatenate(arrays, axis) + + +cdef inline _alloc_async_transfer_buffer(Py_ssize_t nbytes): + try: + return pinned_memory.alloc_pinned_memory(nbytes) + except CUDARuntimeError as e: + if e.status != runtime.errorMemoryAllocation: + raise + warnings.warn( + 'Using synchronous transfer as pinned memory ({} bytes) ' + 'could not be allocated. ' + 'This generally occurs because of insufficient host memory. 
' + 'The original error was: {}'.format(nbytes, e), + _util.PerformanceWarning) + + return None + + +cpdef _ndarray_base _internal_ascontiguousarray(_ndarray_base a): + if a._c_contiguous: + return a + newarray = _ndarray_init(ndarray, a._shape, a.dtype, None) + elementwise_copy(a, newarray) + return newarray + + +cpdef _ndarray_base _internal_asfortranarray(_ndarray_base a): + cdef _ndarray_base newarray + cdef int m, n + cdef intptr_t handle + + if a._f_contiguous: + return a + + newarray = ndarray(a.shape, a.dtype, order='F') + if (a._c_contiguous and a._shape.size() == 2 and + (a.dtype == numpy.float32 or a.dtype == numpy.float64)): + m, n = a.shape + handle = device.get_cublas_handle() + one = numpy.array(1, dtype=a.dtype) + zero = numpy.array(0, dtype=a.dtype) + if a.dtype == numpy.float32: + cublas.sgeam( + handle, + 1, # transpose a + 1, # transpose newarray + m, n, one.ctypes.data, a.data.ptr, n, + zero.ctypes.data, a.data.ptr, n, newarray.data.ptr, m) + elif a.dtype == numpy.float64: + cublas.dgeam( + handle, + 1, # transpose a + 1, # transpose newarray + m, n, one.ctypes.data, a.data.ptr, n, + zero.ctypes.data, a.data.ptr, n, newarray.data.ptr, m) + else: + elementwise_copy(a, newarray) + return newarray + + +cpdef _ndarray_base ascontiguousarray(_ndarray_base a, dtype=None): + cdef bint same_dtype = False + zero_dim = a._shape.size() == 0 + if dtype is None: + same_dtype = True + dtype = a.dtype + else: + dtype = get_dtype(dtype) + same_dtype = dtype == a.dtype + + if same_dtype and a._c_contiguous: + if zero_dim: + return _manipulation._ndarray_ravel(a, 'C') + return a + + shape = (1,) if zero_dim else a.shape + newarray = ndarray(shape, dtype) + elementwise_copy(a, newarray) + return newarray + + +cpdef _ndarray_base asfortranarray(_ndarray_base a, dtype=None): + cdef _ndarray_base newarray + cdef bint same_dtype = False + zero_dim = a._shape.size() == 0 + + if dtype is None: + dtype = a.dtype + same_dtype = True + else: + dtype = get_dtype(dtype) + same_dtype = dtype == a.dtype + + if same_dtype and a._f_contiguous: + if zero_dim: + return _manipulation._ndarray_ravel(a, 'F') + return a + + if same_dtype and not zero_dim: + return _internal_asfortranarray(a) + + newarray = ndarray((1,) if zero_dim else a.shape, dtype, order='F') + elementwise_copy(a, newarray) + return newarray + + +cpdef _ndarray_base _convert_object_with_cuda_array_interface(a): + if runtime._is_hip_environment: + raise RuntimeError( + 'HIP/ROCm does not support cuda array interface') + + cdef Py_ssize_t sh, st + cdef dict desc = a.__cuda_array_interface__ + cdef tuple shape = desc['shape'] + cdef int dev_id = -1 + cdef size_t nbytes + + ptr = desc['data'][0] + dtype = numpy.dtype(desc['typestr']) + if dtype.byteorder == '>': + raise ValueError('CuPy does not support the big-endian byte-order') + mask = desc.get('mask') + if mask is not None: + raise ValueError('CuPy currently does not support masked arrays.') + strides = desc.get('strides') + if strides is not None: + nbytes = 0 + for sh, st in zip(shape, strides): + nbytes = max(nbytes, abs(sh * st)) + else: + nbytes = internal.prod_sequence(shape) * dtype.itemsize + # the v2 protocol sets ptr=0 for 0-size arrays, so we can't look up + # the pointer attributes and must use the current device + if nbytes == 0: + dev_id = device.get_device_id() + mem = memory_module.UnownedMemory(ptr, nbytes, a, dev_id) + memptr = memory.MemoryPointer(mem, 0) + # the v3 protocol requires an immediate synchronization, unless + # 1. 
the stream is not set (ex: from v0 ~ v2) or is None + # 2. users explicitly overwrite this requirement + stream_ptr = desc.get('stream') + if stream_ptr is not None: + if _util.CUDA_ARRAY_INTERFACE_SYNC: + runtime.streamSynchronize(stream_ptr) + return ndarray(shape, dtype, memptr, strides) + + +cdef _ndarray_base _ndarray_init(subtype, const shape_t& shape, dtype, obj): + # Use `_no_init=True` for fast init. Now calling `__array_finalize__` is + # responsibility of this function. + cdef _ndarray_base ret = ndarray.__new__(subtype, _obj=obj, _no_init=True) + ret._init_fast(shape, dtype, True) + if subtype is not ndarray: + ret.__array_finalize__(obj) + return ret + + +cdef _ndarray_base _create_ndarray_from_shape_strides( + subtype, const shape_t& shape, const strides_t& strides, dtype, obj): + cdef int ndim = shape.size() + cdef int64_t begin = 0, end = dtype.itemsize + cdef memory.MemoryPointer ptr + for i in range(ndim): + if strides[i] > 0: + end += strides[i] * (shape[i] - 1) + elif strides[i] < 0: + begin += strides[i] * (shape[i] - 1) + ptr = memory.alloc(end - begin) + begin + return ndarray.__new__( + subtype, shape, dtype, _obj=obj, memptr=ptr, strides=strides) + + +cpdef min_scalar_type(a): + """ + For scalar ``a``, returns the data type with the smallest size + and smallest scalar kind which can hold its value. For non-scalar + array ``a``, returns the vector's dtype unmodified. + + .. seealso:: :func:`numpy.min_scalar_type` + """ + if isinstance(a, ndarray): + return a.dtype + _, concat_type, concat_dtype = _array_info_from_nested_sequence(a) + if concat_type is not None: + return concat_dtype + return numpy.min_scalar_type(a) diff --git a/cupy/_core/dlpack.pxd b/cupy/_core/dlpack.pxd new file mode 100644 index 0000000..d09cdc3 --- /dev/null +++ b/cupy/_core/dlpack.pxd @@ -0,0 +1,12 @@ +from cupy._core.core cimport _ndarray_base + + +cdef extern from './include/cupy/dlpack/dlpack.h' nogil: + int device_CUDA 'kDLCUDA' + int managed_CUDA 'kDLCUDAManaged' + int device_ROCM 'kDLROCM' + + +cpdef object toDlpack(_ndarray_base array) except + +cpdef _ndarray_base fromDlpack(object dltensor) except + +cpdef from_dlpack(array) diff --git a/cupy/_core/dlpack.pyx b/cupy/_core/dlpack.pyx new file mode 100644 index 0000000..7e4e3c6 --- /dev/null +++ b/cupy/_core/dlpack.pyx @@ -0,0 +1,412 @@ +cimport cpython # NOQA + +from libc cimport stdlib +from libc.stdint cimport uint8_t +from libc.stdint cimport uint16_t +from libc.stdint cimport int32_t +from libc.stdint cimport int64_t +from libc.stdint cimport uint64_t +from libc.stdint cimport intptr_t +from libcpp.vector cimport vector + +from cupy_backends.cuda.api cimport runtime +from cupy_backends.cuda cimport stream as stream_module +from cupy._core.core cimport _ndarray_base +from cupy.cuda cimport memory + +import warnings + +import cupy +import cupy._core.core as core + + +cdef extern from './include/cupy/dlpack/dlpack.h' nogil: + cdef int DLPACK_VERSION + + cdef enum DLDeviceType: + kDLCPU + kDLCUDA + kDLCUDAHost + kDLOpenCL + kDLVulkan + kDLMetal + kDLVPI + kDLROCM + kDLROCMHost + kDLExtDev + kDLCUDAManaged + kDLOneAPI + kDLWebGPU + kDLHexagon + + ctypedef struct DLDevice: + DLDeviceType device_type + int32_t device_id + + cdef enum DLDataTypeCode: + kDLInt + kDLUInt + kDLFloat + kDLBfloat + kDLComplex + kDLBool + + ctypedef struct DLDataType: + uint8_t code + uint8_t bits + uint16_t lanes + + ctypedef struct DLTensor: + void* data + DLDevice device + int32_t ndim + DLDataType dtype + int64_t* shape + int64_t* strides + uint64_t 
byte_offset + + ctypedef struct DLManagedTensor: + DLTensor dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor*) # noqa: E211 + + +def get_build_version(): + return str(DLPACK_VERSION) + + +cdef void pycapsule_deleter(object dltensor): + cdef DLManagedTensor* dlm_tensor + # Do not invoke the deleter on a used capsule + if cpython.PyCapsule_IsValid(dltensor, 'dltensor'): + dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer( + dltensor, 'dltensor') + dlm_tensor.deleter(dlm_tensor) + + +cdef void deleter(DLManagedTensor* tensor) with gil: + if tensor.manager_ctx is NULL: + return + stdlib.free(tensor.dl_tensor.shape) + cpython.Py_DECREF(<_ndarray_base>tensor.manager_ctx) + tensor.manager_ctx = NULL + stdlib.free(tensor) + + +# The name of this function follows the framework integration guide of +# TensorComprehensions. +cpdef object toDlpack(_ndarray_base array) except +: + cdef DLManagedTensor* dlm_tensor = \ + <DLManagedTensor*>stdlib.malloc(sizeof(DLManagedTensor)) + + cdef size_t ndim = array._shape.size() + cdef DLTensor* dl_tensor = &dlm_tensor.dl_tensor + cdef intptr_t data_ptr = array.data.ptr + dl_tensor.data = <void*>data_ptr + dl_tensor.ndim = ndim + + cdef int64_t* shape_strides = \ + <int64_t*>stdlib.malloc(ndim * sizeof(int64_t) * 2) + for n in range(ndim): + shape_strides[n] = array._shape[n] + dl_tensor.shape = shape_strides + for n in range(ndim): + shape_strides[n + ndim] = array._strides[n] // array.dtype.itemsize + + dl_tensor.strides = shape_strides + ndim + dl_tensor.byte_offset = 0 + + cdef DLDevice* device = &dl_tensor.device + cdef bint is_managed + cdef int dev_id = array.data.device_id + if not runtime._is_hip_environment: + attrs = runtime.pointerGetAttributes(data_ptr) + is_managed = (attrs.type == runtime.memoryTypeManaged) + if is_managed: + device.device_type = kDLCUDAManaged + dev_id = 0 # make it accessible on CPU too + else: + device.device_type = kDLCUDA + else: + device.device_type = kDLROCM + device.device_id = dev_id + + cdef DLDataType* dtype = &dl_tensor.dtype + if array.dtype.kind == 'u': + dtype.code = kDLUInt + elif array.dtype.kind == 'i': + dtype.code = kDLInt + elif array.dtype.kind == 'f': + dtype.code = kDLFloat + elif array.dtype.kind == 'c': + dtype.code = kDLComplex + elif array.dtype.kind == 'b': + dtype.code = kDLBool + else: + raise ValueError('Unknown dtype') + dtype.lanes = 1 + dtype.bits = (array.dtype.itemsize * 8) + + dlm_tensor.manager_ctx = <void*>array + cpython.Py_INCREF(array) + dlm_tensor.deleter = deleter + + return cpython.PyCapsule_New(dlm_tensor, 'dltensor', pycapsule_deleter) + + +# TODO(leofang): Support kDLCUDAPinned and kDLROCMPinned +cdef class DLPackMemory(memory.BaseMemory): + + """Memory object for a dlpack tensor. + + This does not allocate any memory.
+ + """ + + cdef DLManagedTensor* dlm_tensor + cdef object dltensor + + def __init__(self, object dltensor): + cdef DLManagedTensor* dlm_tensor + + # sanity checks + if not cpython.PyCapsule_IsValid(dltensor, 'dltensor'): + raise ValueError('A DLPack tensor object cannot be consumed ' + 'multiple times') + dlm_tensor = cpython.PyCapsule_GetPointer( + dltensor, 'dltensor') + if runtime._is_hip_environment: + if dlm_tensor.dl_tensor.device.device_type != kDLROCM: + raise RuntimeError('CuPy is built against ROCm/HIP, different ' + 'from the backend that backs the incoming ' + 'DLPack tensor') + else: + if dlm_tensor.dl_tensor.device.device_type not in ( + kDLCUDA, kDLCUDAManaged): + raise RuntimeError('CuPy is built against CUDA, different ' + 'from the backend that backs the incoming ' + 'DLPack tensor') + + self.dltensor = dltensor + self.dlm_tensor = dlm_tensor + self.ptr = dlm_tensor.dl_tensor.data + if dlm_tensor.dl_tensor.device.device_type == kDLCUDAManaged: + # look up the actual physical device as the id from + # dl_tensor could be 0 + attrs = runtime.pointerGetAttributes(self.ptr) + self.device_id = attrs.device + else: + self.device_id = dlm_tensor.dl_tensor.device.device_id + + cdef int n = 0, s = 0 + cdef int ndim = dlm_tensor.dl_tensor.ndim + cdef int64_t* shape = dlm_tensor.dl_tensor.shape + for s in shape[:ndim]: + n += s + self.size = dlm_tensor.dl_tensor.dtype.bits * n // 8 + + def __dealloc__(self): + cdef DLManagedTensor* dlm_tensor = self.dlm_tensor + # dlm_tensor could be uninitialized if an error is raised in __init__ + if dlm_tensor != NULL: + dlm_tensor.deleter(dlm_tensor) + + +# The name of this function is following the framework integration guide of +# TensorComprehensions. +cpdef _ndarray_base fromDlpack(object dltensor) except +: + """Zero-copy conversion from a DLPack tensor to a :class:`~cupy.ndarray`. + + DLPack is a open in memory tensor structure proposed in this repository: + `dmlc/dlpack `_. + + This function takes a :class:`PyCapsule` object which contains a pointer to + a DLPack tensor as input, and returns a :class:`~cupy.ndarray`. This + function does not copy the data in the DLPack tensor but both + DLPack tensor and :class:`~cupy.ndarray` have pointers which are pointing + to the same memory region for the data. + + Args: + dltensor (:class:`PyCapsule`): Input DLPack tensor which is + encapsulated in a :class:`PyCapsule` object. + + Returns: + array (:class:`~cupy.ndarray`): A CuPy ndarray. + + .. warning:: + + This function is deprecated in favor of :func:`~cupy.from_dlpack` and + will be removed in a future version of CuPy. + + .. warning:: + + As of the DLPack v0.5 specification, it is implicitly assumed that + the user is responsible to ensure the Producer and the Consumer are + operating on the same stream. + + .. seealso:: + + :meth:`cupy.ndarray.toDlpack` is a method for zero-copy conversion + from a :class:`~cupy.ndarray` to a DLPack tensor (which is encapsulated + in a :class:`PyCapsule` object). + + .. 
admonition:: Example + + >>> import cupy + >>> array1 = cupy.array([0, 1, 2], dtype=cupy.float32) + >>> dltensor = array1.toDlpack() + >>> array2 = cupy.fromDlpack(dltensor) + >>> cupy.testing.assert_array_equal(array1, array2) + + """ + warnings.warn('This function is deprecated in favor of cupy.from_dlpack', + DeprecationWarning) + return _dlpack_to_cupy_array(dltensor) + + +cdef inline _ndarray_base _dlpack_to_cupy_array(dltensor) except +: + cdef DLPackMemory mem = DLPackMemory(dltensor) + cdef DLDataType dtype = mem.dlm_tensor.dl_tensor.dtype + cdef int bits = dtype.bits + if dtype.lanes != 1: + raise ValueError(f'vector dtypes (lanes={dtype.lanes}) are ' + 'not supported') + if dtype.code == kDLUInt: + if bits == 8: + cp_dtype = cupy.uint8 + elif bits == 16: + cp_dtype = cupy.uint16 + elif bits == 32: + cp_dtype = cupy.uint32 + elif bits == 64: + cp_dtype = cupy.uint64 + else: + raise TypeError('uint{} is not supported.'.format(bits)) + elif dtype.code == kDLInt: + if bits == 8: + cp_dtype = cupy.int8 + elif bits == 16: + cp_dtype = cupy.int16 + elif bits == 32: + cp_dtype = cupy.int32 + elif bits == 64: + cp_dtype = cupy.int64 + else: + raise TypeError('int{} is not supported.'.format(bits)) + elif dtype.code == kDLFloat: + if bits == 16: + cp_dtype = cupy.float16 + elif bits == 32: + cp_dtype = cupy.float32 + elif bits == 64: + cp_dtype = cupy.float64 + else: + raise TypeError('float{} is not supported.'.format(bits)) + elif dtype.code == kDLComplex: + # TODO(leofang): support complex32 + if bits == 64: + cp_dtype = cupy.complex64 + elif bits == 128: + cp_dtype = cupy.complex128 + else: + raise TypeError('complex{} is not supported.'.format(bits)) + elif dtype.code == kDLBool: + if bits == 8: + cp_dtype = cupy.bool_ + else: + raise TypeError(f'{bits}-bit bool is not supported') + elif dtype.code == kDLBfloat: + raise NotImplementedError('CuPy does not support bfloat16 yet') + else: + raise TypeError('Unsupported dtype. dtype code: {}'.format(dtype.code)) + + mem_ptr = memory.MemoryPointer(mem, mem.dlm_tensor.dl_tensor.byte_offset) + cdef int64_t ndim = mem.dlm_tensor.dl_tensor.ndim + + cdef int64_t* shape = mem.dlm_tensor.dl_tensor.shape + cdef vector[Py_ssize_t] shape_vec + shape_vec.assign(shape, shape + ndim) + + if mem.dlm_tensor.dl_tensor.strides is NULL: + # Make sure this capsule will never be used again. + cpython.PyCapsule_SetName(mem.dltensor, 'used_dltensor') + return core.ndarray(shape_vec, cp_dtype, mem_ptr, strides=None) + cdef int64_t* strides = mem.dlm_tensor.dl_tensor.strides + cdef vector[Py_ssize_t] strides_vec + for i in range(ndim): + strides_vec.push_back(strides[i] * (bits // 8)) + + # Make sure this capsule will never be used again. + cpython.PyCapsule_SetName(mem.dltensor, 'used_dltensor') + return core.ndarray(shape_vec, cp_dtype, mem_ptr, strides=strides_vec) + + +cpdef from_dlpack(array): + """Zero-copy conversion between array objects compliant with the DLPack + data exchange protocol. + + Args: + array (object): an array object that implements two methods: + ``__dlpack__()`` and ``__dlpack_device__()``. + + Returns: + cupy.ndarray: a CuPy array that can be safely accessed on CuPy's + current stream. + + .. note:: + This function is different from CuPy's legacy :func:`~cupy.fromDlpack` + function.
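A minimal usage sketch of the exchange implemented by `from_dlpack`/`_dlpack_to_cupy_array` above. It assumes a CUDA device is available and that the installed CuPy's `ndarray` provides the producer side of the protocol (`__dlpack__`/`__dlpack_device__`), which the consumer path below requires:

```python
import cupy as cp

a = cp.arange(6, dtype=cp.float32).reshape(2, 3)
b = cp.from_dlpack(a)   # zero-copy: b shares a's device memory
b[0, 0] = 99.0
print(a[0, 0])          # 99.0 - both views see the same buffer
```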
This function takes any object implementing the DLPack data + exchange protocol, as well as a raw :class:`PyCapsule` object that + contains the DLPack tensor as input (for backward compatibility), + whereas :func:`~cupy.fromDlpack` only accepts :class:`PyCapsule` + objects. If the input object is not compliant with the protocol, users + are responsible for ensuring data safety. + + .. seealso:: + :func:`numpy.from_dlpack`, + `Python Specification for DLPack`_, + `Data interchange mechanisms`_ + + .. _Python Specification for DLPack: + https://dmlc.github.io/dlpack/latest/python_spec.html + .. _Data interchange mechanisms: + https://data-apis.org/array-api/latest/design_topics/data_interchange.html + """ + if not hasattr(array, '__dlpack_device__'): + # backward compatibility: accept passing in a pycapsule + dltensor = array + return _dlpack_to_cupy_array(dltensor) + else: + dev_type, dev_id = array.__dlpack_device__() + + # CuPy is the consumer, so we provide our current stream to the producer + if dev_type == kDLCUDA or dev_type == kDLCUDAManaged: + prev_device = cupy.cuda.runtime.getDevice() + try: + cupy.cuda.runtime.setDevice(dev_id) + assert not runtime._is_hip_environment + stream = stream_module.get_current_stream_ptr() + if stream == 0: + stream = stream_module.get_default_stream_ptr() + dltensor = array.__dlpack__(stream=stream) + finally: + cupy.cuda.runtime.setDevice(prev_device) + elif dev_type == kDLROCM: + prev_device = cupy.cuda.runtime.getDevice() + try: + cupy.cuda.runtime.setDevice(dev_id) + assert runtime._is_hip_environment + stream = stream_module.get_current_stream_ptr() + dltensor = array.__dlpack__(stream=stream) + finally: + cupy.cuda.runtime.setDevice(prev_device) + elif dev_type == kDLCPU: + raise TypeError( + 'CPU arrays cannot be directly imported to CuPy.
' + 'Use `cupy.array(numpy.from_dlpack(input))` instead.') + else: + # TODO(leofang): support kDLCUDAPinned etc + dltensor = None + raise TypeError(f'Unsupported array type: {dev_type}') + + return _dlpack_to_cupy_array(dltensor) diff --git a/cupy/_core/flags.pyx b/cupy/_core/flags.pyx new file mode 100644 index 0000000..8fc9702 --- /dev/null +++ b/cupy/_core/flags.pyx @@ -0,0 +1,34 @@ +# distutils: language = c++ + + +class Flags(object): + + def __init__(self, c_contiguous, f_contiguous, owndata): + self.c_contiguous = c_contiguous + self.f_contiguous = f_contiguous + self.owndata = owndata + + @property + def fnc(self): + return self.f_contiguous and not self.c_contiguous + + @property + def forc(self): + return self.f_contiguous or self.c_contiguous + + def __getitem__(self, name): + if name == 'C_CONTIGUOUS': + return self.c_contiguous + elif name == 'F_CONTIGUOUS': + return self.f_contiguous + elif name == 'OWNDATA': + return self.owndata + else: + raise KeyError('%s is not defined for cupy.ndarray.flags' % name) + + def __repr__(self): + t = ' %s : %s' + ret = [] + for name in 'C_CONTIGUOUS', 'F_CONTIGUOUS', 'OWNDATA': + ret.append(t % (name, self[name])) + return '\n'.join(ret) diff --git a/cupy/_core/fusion.pyx b/cupy/_core/fusion.pyx new file mode 100644 index 0000000..6d496e2 --- /dev/null +++ b/cupy/_core/fusion.pyx @@ -0,0 +1,1004 @@ +from cupy._core cimport _accelerator +from cupy._core._accelerator cimport ACCELERATOR_CUB +from cupy._core._scalar cimport get_typename, _get_cuda_scalar_repr + +import functools +import string + +import numpy + +import cupy +from cupy._core._dtype import get_dtype, _raise_if_invalid_cast +from cupy._core import _kernel +from cupy._core import _fusion_thread_local +from cupy._core import _reduction +from cupy._core import core +from cupy._core import new_fusion + + +_is_fusing = _fusion_thread_local.is_fusing # NOQA +_thread_local = _fusion_thread_local.thread_local + +cdef dict _kind_score = { + 'b': 0, + 'u': 1, + 'i': 1, + 'f': 2, + 'c': 2, +} + +cdef list _dtype_list = [numpy.dtype(_) for _ in '?bhilqBHILQefdFD'] + +cdef tuple _acceptable_types = ( + core.ndarray, numpy.ndarray, numpy.generic, + int, float, complex, bool, type(None)) + + +class _Submodule(object): + """Ufunc or elementwise kernel with types. + + Attributes: + name (str): The name of the submodule. + in_params (list of tuples of dtype and str): + The list of (dtype, name) pairs of the input parameters. + out_params (list of tuples of dtype and str): + The list of (dtype, name) pairs of the output parameters. + op (str): The operation code. + preamble (str): The preamble code. + dtypes (list of dtypes): The list of dtypes of the parameters.
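As a quick illustration of the `Flags` object defined in flags.pyx above, here is how the derived `fnc` ("F and not C") and `forc` ("F or C") properties behave on CuPy arrays; a sketch assuming a CUDA device is available:

```python
import cupy as cp

a = cp.zeros((3, 4))               # C-contiguous by default
print(a.flags['C_CONTIGUOUS'])     # True
print(a.flags.fnc)                 # False: a is C-contiguous
b = cp.asfortranarray(a)
print(b.flags.fnc)                 # True: F-contiguous but not C-contiguous
print(b.flags.forc)                # True: at least one contiguity holds
```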
+ """ + + def __init__(self, ufunc, in_params, out_params, op): + self.name = ufunc.name + self.in_params = in_params + self.out_params = out_params + self.op = op + self.preamble = ufunc._preamble + self.dtypes = [dtype for dtype, _ in self.in_params + self.out_params] + + def __repr__(self): + return '<_Submodule {}>'.format(self.name) + + def fcall(self, args): + return self.name + '(' + ', '.join(args) + ');\n' + + def key(self): + return (self.name, tuple(self.dtypes)) + + def code(self): + params = ', '.join('{} &{}'.format(get_typename(t), s) + for t, s in self.in_params + self.out_params) + typedef = ''.join('typedef {} {}_type;\n'.format(get_typename(t), s) + for t, s in self.in_params + self.out_params) + module_code = string.Template(''' + __device__ void ${name}(${parameters}) { + ${typedef} + ${operation}; + } + ''').substitute( + name=self.name, + parameters=params, + operation=self.op, + typedef=typedef) + return module_code + '\n' + + +class _FusionVarCUDA(object): + + """Local variable in CUDA program. + + Attributes: + index (int): The name of the variable. + dtype (dtype): The dtype of the variable. + const_value (any of primitive types): The constant value (or None) + """ + + def __init__(self, index, dtype, const_value=None): + self.index = index + self.dtype = dtype + self.const_value = const_value + self.mutable = False + + def __repr__(self): + return 'v{}'.format(self.index) + + def mutate(self): + self.mutable = True + + def declaration(self): + c = self.const_value + val = c.item() if hasattr(c, 'dtype') else c + ctype = get_typename(self.dtype) + + if self.const_value is None: + return '{} v{};\n'.format(ctype, self.index) + + code = _get_cuda_scalar_repr(val, self.dtype) + return 'const {} v{} = {};\n'.format(ctype, self.index, code) + + def declaration_in_param(self): + non_const = '_non_const ' if self.mutable else '' + return '{}{} v{}'.format(non_const, self.dtype, self.index) + + def declaration_out_param(self): + return '{} v{}'.format(self.dtype, self.index) + + +class _FusionOp(object): + + """Function call with arguments in CUDA program. + + Attributes: + index (int): The index of this operation. + submodule (submodule): The submodules called in this operation. + args (list of _FusionVarCUDA): The arguments. + types (list of dtype): The types of parameters. + """ + + def __init__(self, index, submodule, args): + self.index = index + self.submodule = submodule + self.args = args + self.dtypes = submodule.dtypes + + def __repr__(self): + return '<_FusionOp #{}, {} types=[{}]>'.format( + self.index, self.submodule.name, ', '.join(map(str, self.dtypes))) + + def declaration_args(self): + return ' '.join('{} v{}_{};'.format(get_typename(t), self.index, j) + for j, t in enumerate(self.dtypes)) + '\n' + + def code(self): + args_sub = ['v{}_{}'.format(self.index, i) + for i in range(len(self.args))] + ctypes = [get_typename(t) for t in self.dtypes] + args_list = list(zip(self.args, args_sub, ctypes)) + code = '// op # {}\n'.format(self.index) + code += ''.join('{} = static_cast< {} >(v{});\n'.format(s, t, v.index) + for v, s, t in args_list) + code += self.submodule.fcall(args_sub) + code += ''.join('v{} = static_cast< {} >({});\n'.format( + v.index, get_typename(v.dtype), s) + for v, s, _ in + args_list[len(self.submodule.in_params):]) + return code + + +class _FusionVarScalar(object): + + """The values of variables in target function of fusion. + + Args: + var (_FusionVarCUDA) + ndim (int) + is_postmap (bool) + + Attributes: + dtype (dtype): The data type. 
+ """ + + def __init__(self, var, ndim, is_postmap): + self._var = var + self.dtype = var.dtype + self.ndim = ndim + self._is_postmap = is_postmap + assert ndim == -1 + + def __repr__(self): + return '<_FusionVar {} scalar>'.format(self.dtype) + + def __neg__(self): + return cupy.negative(self) + + def __add__(self, other): + return cupy.add(self, other) + + def __radd__(self, other): + return cupy.add(other, self) + + def __sub__(self, other): + return cupy.subtract(self, other) + + def __rsub__(self, other): + return cupy.subtract(other, self) + + def __mul__(self, other): + return cupy.multiply(self, other) + + def __rmul__(self, other): + return cupy.multiply(other, self) + + def __div__(self, other): + return cupy.divide(self, other) + + def __rdiv__(self, other): + return cupy.divide(other, self) + + def __truediv__(self, other): + return cupy.true_divide(self, other) + + def __rtruediv__(self, other): + return cupy.true_divide(other, self) + + def __floordiv__(self, other): + return cupy.floor_divide(self, other) + + def __rfloordiv__(self, other): + return cupy.floor_divide(other, self) + + def __mod__(self, other): + return cupy.remainder(self, other) + + def __rmod__(self, other): + return cupy.remainder(other, self) + + def __pow__(x, y): + return cupy.power(x, y) + + def __lshift__(self, other): + return cupy.left_shift(self, other) + + def __rlshift__(self, other): + return cupy.left_shift(other, self) + + def __rshift__(self, other): + return cupy.right_shift(self, other) + + def __rrshift__(self, other): + return cupy.right_shift(other, self) + + def __and__(self, other): + return cupy.bitwise_and(self, other) + + def __rand__(self, other): + return cupy.bitwise_and(other, self) + + def __or__(self, other): + return cupy.bitwise_or(self, other) + + def __ror__(self, other): + return cupy.bitwise_or(other, self) + + def __xor__(self, other): + return cupy.bitwise_xor(self, other) + + def __rxor__(self, other): + return cupy.bitwise_xor(other, self) + + def __invert__(self): + return cupy.invert(self) + + def __lt__(self, other): + return cupy.less(self, other) + + def __le__(self, other): + return cupy.less_equal(self, other) + + def __eq__(self, other): + return cupy.equal(self, other) + + def __ne__(self, other): + return cupy.not_equal(self, other) + + def __gt__(self, other): + return cupy.greater(self, other) + + def __ge__(self, other): + return cupy.greater_equal(self, other) + + def __nonzero__(self): + raise Exception('Can\'t cast to bool') + + def __bool__(self): + raise Exception('Can\'t cast to bool') + + def __setitem__(self, slices, value): + if slices is Ellipsis or (isinstance(slices, slice) and + slices == slice(None)): + _call_ufunc(core.elementwise_copy, value, out=self) + else: + raise ValueError('The fusion supports `[...]` or `[:]`.') + + def copy(self): + return cupy.copy(self) + + def astype(self, dtype, order=None, casting=None, subok=None, copy=True): + dtype = get_dtype(dtype) + if order is not None: + raise TypeError('order is not supported yet') + if casting is not None: + raise TypeError('casting is not supported yet') + if subok is not None: + raise TypeError('subok is not supported yet') + if not copy and self.dtype == dtype: + return self + return _dtype_to_astype(dtype)(self) + + +class _FusionVarArray(_FusionVarScalar): + + def __init__(self, var, ndim, is_postmap): + self._var = var + self.dtype = var.dtype + self.ndim = ndim + self._is_postmap = is_postmap + assert ndim >= 0 + + def __repr__(self): + return '<_FusionVar {} {}-dim 
array>'.format(self.dtype, self.ndim) + + def __iadd__(self, other): + return cupy.add(self, other, self) + + def __isub__(self, other): + return cupy.subtract(self, other, self) + + def __imul__(self, other): + return cupy.multiply(self, other, self) + + def __idiv__(self, other): + return cupy.divide(self, other, self) + + def __itruediv__(self, other): + return cupy.true_divide(self, other, self) + + def __ifloordiv__(self, other): + return cupy.floor_divide(self, other, self) + + def __imod__(self, other): + return cupy.remainder(self, other, self) + + def __ipow__(self, other): + return cupy.power(self, other, self) + + def __ilshift__(self, other): + return cupy.left_shift(self, other, self) + + def __irshift__(self, other): + return cupy.right_shift(self, other, self) + + def __iand__(self, other): + return cupy.bitwise_and(self, other, self) + + def __ior__(self, other): + return cupy.bitwise_or(self, other, self) + + def __ixor__(self, other): + return cupy.bitwise_xor(self, other, self) + + +class _FusionHistory(object): + + """History of the operations executed in the target function of fusion. + + Attributes: + preamble_set (set of str): The preambles of submodules. + submodules (dict from str to submodule): The submodules. + count (int): The number of variables in the fused function. + + op_list (list of _FusionOp): The map operations. + param_list (list of _FusionVarCUDA): The parameters. + local_list (list of _FusionVarCUDA): The local variables. + + The following attributes are updated only when fusing a reduction. + + reduce_op (tuple): One of the elements of reduction.***._raws._ops. + reduce_identity (any type): The identity value of the reduction. + reduce_kwargs (dict or None): kwargs of the reduction. + + premap_ret (_FusionVarCUDA or None): The target of the reduction. + postmap_param (_FusionVarCUDA or None): The result of the reduction. + postmap_op_list (list of _FusionOp): The post-map operations.
postmap_local_list (list of _FusionVarCUDA): The local variables which + appear in the post-map operations. + """ + + def __init__(self): + self.preamble_set = set() + self.submodules = dict() + self.count = 0 + + self.op_list = [] + self.param_list = [] + self.local_list = [] + + self.reduce_op = None + self.reduce_identity = None + self.reduce_kwargs = None + + self.postmap_op_list = [] + self.premap_ret = None + self.postmap_param = None + self.postmap_local_list = [] + + def __repr__(self): + return '<_FusionMem, op_list={}, param_list={}, local_list={}>'.format( + self.op_list, self.param_list, self.local_list) + + def _has_reduction(self): + return self.reduce_op is not None + + def _fresh_index(self): + res = self.count + self.count += 1 + return res + + def _fresh_premap_param(self, *args, **kwargs): + index = self._fresh_index() + var = _FusionVarCUDA(index, *args, **kwargs) + self.param_list.append(var) + return var + + def _fresh_postmap_param(self, *args, **kwargs): + assert self.postmap_param is None + index = self._fresh_index() + var = _FusionVarCUDA(index, *args, **kwargs) + self.postmap_param = var + return var + + def _fresh_premap_local(self, *args, **kwargs): + index = self._fresh_index() + var = _FusionVarCUDA(index, *args, **kwargs) + self.local_list.append(var) + return var + + def _fresh_postmap_local(self, *args, **kwargs): + index = self._fresh_index() + var = _FusionVarCUDA(index, *args, **kwargs) + self.postmap_local_list.append(var) + return var + + def _fresh_local(self, *args, **kwargs): + if self._has_reduction(): + return self._fresh_postmap_local(*args, **kwargs) + else: + return self._fresh_premap_local(*args, **kwargs) + + def _add_premap_op(self, *args, **kwargs): + op = _FusionOp(len(self.op_list), *args, **kwargs) + subm = op.submodule + self.submodules[subm.key()] = subm + self.op_list.append(op) + self._add_preamble(subm.preamble) + return op + + def _add_postmap_op(self, *args, **kwargs): + op = _FusionOp(len(self.postmap_op_list), *args, **kwargs) + subm = op.submodule + self.submodules[subm.key()] = subm + self.postmap_op_list.append(op) + self._add_preamble(subm.preamble) + return op + + def add_op(self, *args, **kwargs): + if self._has_reduction(): + return self._add_postmap_op(*args, **kwargs) + else: + return self._add_premap_op(*args, **kwargs) + + def set_reduce_op(self, raw, arg, kwargs): + assert self.reduce_op is None + for op in raw._ops.ops: + input_type, = op.in_types + output_type, = op.out_types + if numpy.can_cast(arg.dtype, input_type): + return_dtype = numpy.dtype(output_type) + self.premap_ret = self._get_fusion_var(arg)._var + self.reduce_op = op + self.reduce_identity = raw.identity + self.reduce_kwargs = kwargs + self._add_preamble(raw.preamble) + return self._fresh_postmap_param(return_dtype) + raise TypeError('Type is mismatched. {}(...), {}'.format( + raw._ops.name, arg.dtype.type)) + + def _add_preamble(self, preamble): + self.preamble_set.add(preamble) + + def _get_fusion_var(self, arg): + """This converts `arg` to _FusionVarScalar or _FusionVarArray data.
+ + Args: + arg (_FusionVarScalar, _FusionVarArray or a primitive type) + + Return value: + _FusionVarScalar or _FusionVarArray + """ + if isinstance(arg, (_FusionVarScalar, _FusionVarArray)): + if arg._is_postmap == self._has_reduction(): + return arg + else: + # Map operation between pre-map variable and post-map variable + raise Exception('Shape mismatch') + if isinstance(arg, (int, float, bool, complex, numpy.generic)): + var = self._fresh_local(numpy.dtype(type(arg)), const_value=arg) + return _FusionVarScalar(var, -1, self._has_reduction()) + raise TypeError('Unsupported type {}'.format(type(arg))) + + def call_ufunc(self, ufunc, *args, **kwargs): + nin = ufunc.nin + nout = ufunc.nout + + # Corresponds to _check_should_use_min_scalar in elementwise.pxi + # This function decides which typecast rule to use. + def _should_use_min_scalar(in_args): + max_array_kind = -2 + max_scalar_kind = -1 + for arg in in_args: + kind = _kind_score[arg.dtype.kind] + if isinstance(arg, _FusionVarArray): + max_array_kind = max(max_array_kind, kind) + elif isinstance(arg, _FusionVarScalar): + max_scalar_kind = max(max_scalar_kind, kind) + else: + assert False + return (max_scalar_kind != -1 and + max_array_kind >= max_scalar_kind) + + def can_cast1(args, in_dtypes): + for i in range(nin): + arg = args[i] + if isinstance(arg, _FusionVarArray): + if not numpy.can_cast(arg.dtype, in_dtypes[i]): + return False + elif isinstance(arg, _FusionVarScalar): + scalar_value = arg._var.const_value + if scalar_value is None: + # This typecast is not safe. + # The result of a typecast of an element-wise operation + # between a numpy ndarray and a numpy scalar is not + # decidable statically, because it depends on the value + # of the scalar variable. + scalar_value = arg.dtype.type(0) + if not numpy.can_cast(scalar_value, in_dtypes[i]): + return False + else: + assert False + return True + + def can_cast2(args, in_dtypes): + for i in range(nin): + if not numpy.can_cast(args[i].dtype, in_dtypes[i]): + return False + return True + + def make_fusion_var(var, ndim): + if ndim == -1: + return _FusionVarScalar(var, ndim, self._has_reduction()) + else: + return _FusionVarArray(var, ndim, self._has_reduction()) + + # Make FusionVar list + var_list = [self._get_fusion_var(arg) for arg in args] + in_vars = var_list[:nin] + out_vars = var_list[nin:] + if 'out' in kwargs: + out = kwargs.pop('out') + if out_vars: + raise ValueError('cannot specify \'out\' as both a positional ' + 'and keyword argument') + if isinstance(out, _FusionVarArray): + out_vars.append(self._get_fusion_var(out)) + elif out is not None: + raise ValueError('The \'out\' tuple must have exactly one ' + 'entry per ufunc output') + if kwargs: + raise TypeError('Wrong arguments {}'.format(kwargs)) + if len(in_vars) != nin or len(out_vars) > nout: + raise ValueError('invalid number of arguments') + if not all([isinstance(v, _FusionVarArray) for v in out_vars]): + raise TypeError('return arrays must be of ArrayType') + var_list = in_vars + out_vars + + # Broadcast + ndim = max([v.ndim for v in in_vars]) + if any([v.ndim < ndim for v in out_vars]): + raise ValueError('non-broadcastable output operand') + + # Typecast and add an operation + can_cast = can_cast1 if _should_use_min_scalar(var_list) else can_cast2 + # TODO(asi1024): Fix to use ``guess_routine``. 
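An editorial aside on the `_should_use_min_scalar`/`can_cast1` pair defined above (the dispatch loop that consumes them follows): they mirror NumPy's legacy value-based casting, where a scalar operand is cast-checked on a concrete value (here `arg.dtype.type(0)` when the value is unknown) rather than on its dtype. A small NumPy illustration; the `True` result relies on pre-2.0 NumPy semantics, which this code targets:

```python
import numpy as np

# dtype-based check: int64 does not cast safely to float32
print(np.can_cast(np.int64, np.float32))     # False

# value-based check: the concrete value 0 does fit into float32
print(np.can_cast(np.int64(0), np.float32))  # True on NumPy < 2.0
```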
+ for op in ufunc._ops.ops: + in_dtypes = [numpy.dtype(t) for t in op.in_types] + out_dtypes = [numpy.dtype(t) for t in op.out_types] + if can_cast(var_list, in_dtypes): + ret = [] + for i in range(nout): + if i >= len(out_vars): + out_var = self._fresh_local(out_dtypes[i]) + out_var = make_fusion_var(out_var, ndim) + out_vars.append(out_var) + else: + _raise_if_invalid_cast( + out_dtypes[i], out_vars[i].dtype, 'same_kind', + lambda: f'output {i}') + out_var = out_vars[i] + + out_var._var.mutate() + ret.append(out_var) + + in_params = [(in_dtypes[i], 'in{}'.format(i)) + for i, _ in enumerate(in_vars)] + out_params = [(out_dtypes[i], 'out{}'.format(i)) + for i, _ in enumerate(out_vars)] + subm = _Submodule(ufunc, in_params, out_params, op.routine) + self.add_op(subm, [v._var for v in in_vars + out_vars]) + return ret[0] if len(ret) == 1 else tuple(ret) + in_dtypes = [v.dtype for v in in_vars] + out_dtypes = [v.dtype for v in out_vars] + raise TypeError('Invalid type cast in \'{}\': {} -> {}'.format( + ufunc.name, in_dtypes, out_dtypes)) + + def call_elementwise(self, f, args, kwargs): + raise NotImplementedError( + 'Fusion for elementwise-kernel is not implemented yet') + + def _emit_submodules_code(self): + res = ''.join(self.preamble_set) + res += '\n'.join([_.code() for _ in self.submodules.values()]) + return res + + def _emit_operation_code(self): + res = '// {} operations\n'.format(len(self.op_list)) + res += ''.join(v.declaration() for v in self.local_list) + res += ''.join(op.declaration_args() for op in self.op_list) + res += ''.join(op.code() for op in self.op_list) + return res + + def _emit_premap_code(self, in_params, operation): + return_var = self.premap_ret + module_code = string.Template(''' + __device__ ${return_ctype} _pre_map(${in_params}) { + ${operation}; + return ${return_var}; + } + ''').substitute( + return_ctype=get_typename(return_var.dtype), + in_params=', '.join('{} v{}'.format(get_typename(v.dtype), + v.index) + for v in in_params), + operation=operation, + return_var=return_var) + return module_code + + def _emit_postmap_code(self, out_params, operation): + in_param = self.postmap_param + in_ctype = get_typename(in_param.dtype) + module_code = string.Template(''' + __device__ void _post_map(${in_ctype} in, ${out_params}) { + ${in_param} = in; + ${operation}; + } + ''').substitute( + in_ctype=in_ctype, + in_param='{} v{}'.format(in_ctype, in_param.index), + out_params=', '.join('{} &v{}'.format(get_typename(v.dtype), + v.index) + for v in out_params), + operation=operation) + return module_code + + def _emit_postmap_cast_code(self, reduce_ctype, postmap_dtype, operation): + module_code = string.Template(''' + __device__ ${postmap_ctype} _postmap_cast(${reduce_ctype} a) { + ${postmap_ctype} out0; + ${operation}; + return out0; + } + ''').substitute( + reduce_ctype=reduce_ctype, + postmap_ctype=get_typename(postmap_dtype), + operation=operation) + return module_code + + def _gen_abstracted_args(self, a): + if isinstance(a, core.ndarray): + cuda_var = self._fresh_premap_param(a.dtype) + python_var = _FusionVarArray(cuda_var, a.ndim, False) + elif a is None: + cuda_var = None + python_var = None + else: + cuda_var = self._fresh_premap_param(numpy.dtype(type(a))) + python_var = _FusionVarScalar(cuda_var, -1, False) + return cuda_var, python_var + + def get_fusion(self, func, args, name): + """This generates CUDA kernel from the given function and dtypes. 
+ + This function generates an ElementwiseKernel or a ReductionKernel from + the given function and the list of dtypes of parameters. + + Args: + func (function): The function to be fused. + args (tuple): The tuple of arguments. + name (str): The name of the kernel. + + Return value (tuple of ElementwiseKernel/ReductionKernel and dict): + The second element of the return value is the kwargs that will be + given to the elementwise kernel or reduction kernel. + """ + self.ndim = max([a.ndim for a in args if isinstance(a, core.ndarray)]) + + in_params = [] + function_args = [] + for a in args: + cuda_var, python_var = self._gen_abstracted_args(a) + if cuda_var is not None: + in_params.append(cuda_var) + function_args.append(python_var) + + return_value = func(*function_args) + + if isinstance(return_value, tuple): + return_tuple = True + no_return = False + out_pvars = return_value + elif isinstance(return_value, (_FusionVarScalar, _FusionVarArray)): + return_tuple = False + no_return = False + out_pvars = [return_value] + elif return_value is None: + return_tuple = False + no_return = True + out_pvars = [] + else: + raise TypeError( + 'Fusion function can\'t return {}'.format(type(return_value))) + + out_pvars = [_ for _ in out_pvars if _ is not None] + out_cvars = [self._get_fusion_var(_)._var for _ in out_pvars] + + out_dtypes = [_.dtype for _ in out_pvars] + out_params = [self._fresh_premap_param(t) for t in out_dtypes] + + in_params_code = ', '.join(var.declaration_in_param() + for var in in_params) + out_params_code = ', '.join(var.declaration_out_param() + for var in out_params) + + operation = self._emit_operation_code() + submodule_code = self._emit_submodules_code() + + if self.reduce_op is None: + operation += ' '.join('{} = {};'.format(t, s) + for s, t in zip(out_cvars, out_params)) + kernel = _kernel.ElementwiseKernel( + in_params_code, out_params_code, operation, + preamble=submodule_code, + return_tuple=return_tuple, + no_return=no_return, + name=name) + return kernel, {} + else: + _, reduce_expr, postmap_expr, reduce_ctype = self.reduce_op.routine + if reduce_ctype is None: + reduce_ctype = 'type_out0_raw' + + postmap_type, = self.reduce_op.out_types + postmap_dtype = numpy.dtype(postmap_type) + postmap_ctype = get_typename(postmap_dtype) + + postmap_code = '// {} operations\n'.format( + len(self.postmap_op_list)) + postmap_code += ''.join(v.declaration() + for v in self.postmap_local_list) + postmap_code += ''.join(op.declaration_args() + for op in self.postmap_op_list) + postmap_code += ''.join(op.code() for op in self.postmap_op_list) + postmap_code += ' '.join('{} = {};'.format(t, s) + for s, t in zip(out_cvars, out_params)) + + submodule_code += self._emit_premap_code(in_params, operation) + use_cub = ACCELERATOR_CUB in _accelerator._reduction_accelerators + if not use_cub: + submodule_code += 'typedef {} type_in0_raw;\n'.format( + postmap_ctype) + submodule_code += 'typedef {} type_out0_raw;\n'.format( + postmap_ctype) + submodule_code += self._emit_postmap_cast_code( + reduce_ctype, postmap_dtype, postmap_expr) + submodule_code += self._emit_postmap_code(out_params, postmap_code) + + kernel = _reduction.ReductionKernel( + in_params_code, + out_params_code, + '_pre_map({})'.format(', '.join([repr(p) for p in in_params])), + reduce_expr, + '_post_map(_postmap_cast(a), {})'.format( + ', '.join([repr(p) for p in out_params])), + self.reduce_identity, + name=name, + reduce_type=reduce_ctype, + preamble=submodule_code) + return kernel, self.reduce_kwargs + + +class Fusion(object): + +
"""Function class. + + This class can be get by using `fuse` function and + works like `ElementwiseKernel` or `ReductionKernel`. + + Attributes: + func (function): The function before fusing. + name (str): The name of the function. + """ + + def __init__(self, func, name=None): + self.func = func + self.name = name or func.__name__ + self._memo = {} + self.new_fusion = None + + def __repr__(self): + return ''.format(self.name) + + def __call__(self, *args): + if self.new_fusion is not None: + return self.new_fusion(*args) + + # Inner function of composition of multiple fused functions. + if _fusion_thread_local.is_old_fusing(): + return self.func(*args) + + exec_cupy = False + for a in args: + if isinstance(a, core.ndarray): + exec_cupy = True + break + if not exec_cupy: + # No cupy ndarray exists in the arguments + return self.func(*args) + + # Invalid argument types + for arg in args: + if not isinstance(arg, _acceptable_types): + mes = 'Invalid argument type for \'{}\': ({})' + arg_types = ', '.join(repr(type(a)) for a in args) + raise TypeError(mes.format(self.name, arg_types)) + + # Cache the result of execution path analysis + cdef list params_info = [] + for arg in args: + if isinstance(arg, core.ndarray): + params_info.append(arg.dtype.char) + params_info.append(arg.ndim) + elif isinstance(arg, numpy.generic): + params_info.append(arg.dtype.char) + elif arg is None: + params_info.append(None) + elif isinstance(arg, float): + params_info.append('d') + elif isinstance(arg, int): + params_info.append('l') + elif isinstance(arg, bool): + params_info.append('?') + elif isinstance(arg, complex): + params_info.append('D') + else: + assert False + + cdef tuple key = tuple(params_info) + if key not in self._memo: + try: + history = _FusionHistory() + _thread_local.history = history + _thread_local.is_old_fusing = True + try: + self._memo[key] = history.get_fusion( + self.func, args, self.name) + except Exception: + self.new_fusion = new_fusion.Fusion(self.func, self.name) + _thread_local.history = None + _thread_local.is_old_fusing = False + return self.new_fusion(*args) + finally: + _thread_local.history = None + _thread_local.is_old_fusing = False + kernel, kwargs = self._memo[key] + + return kernel( + *[a for a in args if a is not None], + **kwargs) + + def clear_cache(self): + self._memo = {} + + +def fuse(*args, **kwargs): + """Decorator that fuses a function. + + This decorator can be used to define an elementwise or reduction kernel + more easily than :class:`~cupy.ElementwiseKernel` or + :class:`~cupy.ReductionKernel`. + + Since the fused kernels are cached and reused, it is recommended to reuse + the same decorated functions instead of e.g. decorating local functions + that are defined multiple times. + + Args: + kernel_name (str): Name of the fused kernel function. + If omitted, the name of the decorated function is used. + + Example: + + >>> @cupy.fuse(kernel_name='squared_diff') + ... def squared_diff(x, y): + ... return (x - y) * (x - y) + ... 
+ >>> x = cupy.arange(10) + >>> y = cupy.arange(10)[::-1] + >>> squared_diff(x, y) + array([81, 49, 25, 9, 1, 1, 9, 25, 49, 81]) + """ + + def wrapper(f, kernel_name=None): + return Fusion(f, kernel_name) + + if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): + return functools.update_wrapper(wrapper(args[0]), args[0]) + else: + return lambda f: functools.update_wrapper( + wrapper(f, *args, **kwargs), f) + + +def _call_ufunc(fusion_op, *args, **kwargs): + return _thread_local.history.call_ufunc(fusion_op, *args, **kwargs) + + +def _call_reduction(fusion_op, *args, **kwargs): + if len(args) != 1: + mes = '{}() takes 1 positional argument but {} were given' + raise TypeError(mes.format(fusion_op._ops.name, len(args))) + + arg = args[0] + kwargs = dict([(key, value) for key, value in kwargs.items() + if (key in ('axis', 'out') and value is not None)]) + + if arg._is_postmap: + # Multiple reduction + raise NotImplementedError( + 'Multiple reduction is not implemented yet') + + var = _thread_local.history.set_reduce_op(fusion_op, arg, kwargs) + + src_ndim = max(0, arg.ndim) + if 'axis' in kwargs: + axis = kwargs['axis'] + if isinstance(axis, (tuple, list)): + ndim = src_ndim - len(axis) + else: + ndim = src_ndim - 1 + else: + ndim = 0 + if ndim < 0: + raise numpy.AxisError(axis, src_ndim) + + _thread_local.history.ndim = ndim + if ndim >= 1: + return _FusionVarArray(var, ndim, True) + else: + return _FusionVarScalar(var, -1, True) + + +def _create_astype_ufunc(dtype): + name = 'astype_{}'.format(dtype) + rules = tuple(['{}->{}'.format(cast_from.char, dtype.char) + for cast_from in _dtype_list]) + command = 'out0 = static_cast< {} >(in0)'.format(get_typename(dtype)) + return core.create_ufunc(name, rules, command) + + +_dtype_to_astype_dict = None + + +def _dtype_to_astype(dtype): + global _dtype_to_astype_dict + if _dtype_to_astype_dict is None: + _dtype_to_astype_dict = dict([ + (dt, _create_astype_ufunc(dt)) + for dt in _dtype_list]) + return _dtype_to_astype_dict[dtype] diff --git a/cupy/_core/halffloat.h b/cupy/_core/halffloat.h new file mode 100644 index 0000000..ca0ebed --- /dev/null +++ b/cupy/_core/halffloat.h @@ -0,0 +1,125 @@ +typedef unsigned short npy_uint16; +typedef unsigned int npy_uint32; + +/* + ******************************************************************** + * BIT-LEVEL CONVERSIONS * + ******************************************************************** + */ + + +npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f) +{ + npy_uint32 f_exp, f_sig; + npy_uint16 h_sgn, h_exp, h_sig; + + h_sgn = (npy_uint16) ((f&0x80000000u) >> 16); + f_exp = (f&0x7f800000u); + + /* Exponent overflow/NaN converts to signed inf/NaN */ + if (f_exp >= 0x47800000u) { + if (f_exp == 0x7f800000u) { + /* Inf or NaN */ + f_sig = (f&0x007fffffu); + if (f_sig != 0) { + /* NaN - propagate the flag in the significand... */ + npy_uint16 ret = (npy_uint16) (0x7c00u + (f_sig >> 13)); + /* ...but make sure it stays a NaN */ + if (ret == 0x7c00u) { + ret++; + } + return h_sgn + ret; + } else { + /* signed inf */ + return (npy_uint16) (h_sgn + 0x7c00u); + } + } else { + /* overflow to signed inf */ + return (npy_uint16) (h_sgn + 0x7c00u); + } + } + + /* Exponent underflow converts to a subnormal half or signed zero */ + if (f_exp <= 0x38000000u) { + /* + * Signed zeros, subnormal floats, and floats with small + * exponents all convert to signed zero half-floats. 
+ */ + if (f_exp < 0x33000000u) { + return h_sgn; + } + /* Make the subnormal significand */ + f_exp >>= 23; + f_sig = (0x00800000u + (f&0x007fffffu)); + f_sig >>= (113 - f_exp); + /* Handle rounding by adding 1 to the bit beyond half precision */ + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((f_sig&0x00003fffu) != 0x00001000u) { + f_sig += 0x00001000u; + } + h_sig = (npy_uint16) (f_sig >> 13); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp from zero to one and h_sig will be zero. + * This is the correct result. + */ + return (npy_uint16) (h_sgn + h_sig); + } + + /* Regular case with no overflow or underflow */ + h_exp = (npy_uint16) ((f_exp - 0x38000000u) >> 13); + /* Handle rounding by adding 1 to the bit beyond half precision */ + f_sig = (f&0x007fffffu); + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((f_sig&0x00003fffu) != 0x00001000u) { + f_sig += 0x00001000u; + } + h_sig = (npy_uint16) (f_sig >> 13); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp by one and h_sig will be zero. This is the + * correct result. h_exp may increment to 15, at greatest, in + * which case the result overflows to a signed inf. + */ + return h_sgn + h_exp + h_sig; +} + +npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h) +{ + npy_uint16 h_exp, h_sig; + npy_uint32 f_sgn, f_exp, f_sig; + + h_exp = (h&0x7c00u); + f_sgn = ((npy_uint32)h&0x8000u) << 16; + switch (h_exp) { + case 0x0000u: /* 0 or subnormal */ + h_sig = (h&0x03ffu); + /* Signed zero */ + if (h_sig == 0) { + return f_sgn; + } + /* Subnormal */ + h_sig <<= 1; + while ((h_sig&0x0400u) == 0) { + h_sig <<= 1; + h_exp++; + } + f_exp = ((npy_uint32)(127 - 15 - h_exp)) << 23; + f_sig = ((npy_uint32)(h_sig&0x03ffu)) << 13; + return f_sgn + f_exp + f_sig; + case 0x7c00u: /* inf or NaN */ + /* All-ones exponent and a copy of the significand */ + return f_sgn + 0x7f800000u + (((npy_uint32)(h&0x03ffu)) << 13); + default: /* normalized */ + /* Just need to adjust the exponent and shift */ + return f_sgn + (((npy_uint32)(h&0x7fffu) + 0x1c000u) << 13); + } +} diff --git a/cupy/_core/include/cupy/README.md b/cupy/_core/include/cupy/README.md new file mode 100644 index 0000000..38121d8 --- /dev/null +++ b/cupy/_core/include/cupy/README.md @@ -0,0 +1,22 @@ +# "include" directory + +All files and directories in this directory will be copied to the distribution (sdist and wheel). +Note that items starting with `.` (e.g., `cub/.git`) are excluded. +See `setup.py` for details. + +## CUB + +The `cub` folder is a git submodule for the CUB project. +Including the CUB headers as a submodule enables not only building the `cupy.cuda.cub` module, +but also easier maintenance. +For further information on CUB, see the [CUB Project Website](http://nvlabs.github.com/cub). + +## Jitify +The `Jitify` folder is a git submodule for the Jitify project. +Including the Jitify header as a submodule for building the `cupy.cuda.jitify` module. +For further information on Jitify, see the [Jitify repo](https://github.com/NVIDIA/jitify). + +## DLPack +The `dlpack` folder stores the DLPack header for building the `cupy._core.dlpack` module, +see `README.md` therein. 
+For further information on DLPack, see the [DLPack repo](https://github.com/dmlc/dlpack). diff --git a/cupy/_core/include/cupy/_cuda/README.md b/cupy/_core/include/cupy/_cuda/README.md new file mode 100644 index 0000000..bc5d93f --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/README.md @@ -0,0 +1,9 @@ +These files are copied from CUDA Toolkit Distribution and redistributed under the following license: + +https://docs.nvidia.com/cuda/archive/9.2/eula/#nvidia-cuda-toolkit-license-agreement + +For CUDA 11.2+, we enable CUDA Enhanced Compatibility by hosting the fp16 headers from the latest +CUDA Toolkit release. + +* ``cuda-11`` contains headers from CTK 11.8.0. +* ``cuda-12`` contains headers from CTK 12.1.1. diff --git a/cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.h b/cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.h new file mode 100755 index 0000000..a04c111 --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.h @@ -0,0 +1,3052 @@ +/* +* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. 
+*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions that are +* only supported in device code. +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion And Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +#if defined(__cplusplus) +#if defined(__CUDACC__) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__CUDACC__) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ + +/** + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ≈ 3.311 decimal digits. + * + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. + */ +struct __half; + +/** + * \brief half2 datatype + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * + * \req Vectorified version of half. + */ +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \req +* \param[in] a - float. Is only being read. +* \returns half +* \retval \p a converted to half. 
+* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \req +* \param[in] a - float. Is only being read. +* \returns half +* \retval \p a converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \req +* \param[in] a - float. Is only being read. +* \returns half +* \retval \p a converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \req +* \param[in] a - float. Is only being read. +* +* \returns half +* \retval \p a converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \req +* \param[in] a - float. Is only being read. +* +* \returns half +* \retval \p a converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* \details Converts half number \p a to float. +* \req +* \param[in] a - float. Is only being read. +* +* \returns float +* \retval \p a converted to float. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* \details Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* \req +* \param[in] a - float. Is only being read. +* +* \returns half2 +* \retval The \p half2 value with both halves equal to the converted half +* precision number. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. 
Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \req +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* \retval The \p half2 value with corresponding halves equal to the +* converted input floats. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* \req +* \param[in] a - half2. Is only being read. +* +* \returns float +* \retval The low 16 bits of \p a converted to float. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* \req +* \param[in] a - half2. Is only being read. +* +* \returns float +* \retval The high 16 bits of \p a converted to float. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); + +#if defined(__CUDACC__) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of float2 to half precision in round-to-nearest +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \req +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* \retval The \p half2 which has corresponding halves equal to the +* converted float2 components. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to float2 and returns the +* result. +* \req +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* \retval \p a converted to float2. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-to-nearest-even mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ int __half2int_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-towards-zero mode. +* \req +* \param[in] h - half. 
Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ int __half2int_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-down mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ int __half2int_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-up mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ int __half2int_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* \req +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __int2half_rn(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* \req +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __int2half_rz(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-down mode. +* \req +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __int2half_rd(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-up mode. +* \req +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __int2half_ru(int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-to-nearest-even mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. 
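+*
+* A minimal usage sketch, assuming an input of 2.5 (exactly representable
+* in half), contrasting two of the rounding modes declared here:
+* \code
+* __half h = __float2half(2.5f);
+* short rn = __half2short_rn(h); // 2: the tie rounds to the even integer
+* short ru = __half2short_ru(h); // 3: rounds up towards positive infinity
+* \endcode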
+* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-towards-zero mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ short int __half2short_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-down mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-up mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \req +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __short2half_rn(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \req +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __short2half_rz(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-down mode. +* \req +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __short2half_rd(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-up mode. +* \req +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. 
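+*
+* A short illustrative sketch: half carries an 11-bit significand, so 2049
+* is not exactly representable and the rounding direction decides the result:
+* \code
+* __half rd = __short2half_rd((short)2049); // 2048.0
+* __half ru = __short2half_ru((short)2049); // 2050.0
+* __half rn = __short2half_rn((short)2049); // 2048.0 (tie rounds to even)
+* \endcode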
+* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __short2half_ru(short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-to-nearest-even mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-towards-zero mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-down mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-up mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* \req +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* \req +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-down mode. +* \req +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. 
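+*
+* An illustrative sketch, assuming IEEE-754 directed rounding: inputs above
+* the largest finite half (65504) resolve differently per rounding mode:
+* \code
+* __half up = __uint2half_rn(1000000u); // +infinity (beyond the half range)
+* __half dn = __uint2half_rd(1000000u); // 65504.0 (largest finite half)
+* \endcode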
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-up mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating point
+* value in round-up mode.
+* \req
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating point value \p h to an unsigned short
+* integer in round-to-nearest-even mode.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* \retval \p h converted to an unsigned short integer.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating point value \p h to an unsigned short
+* integer in round-towards-zero mode.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* \retval \p h converted to an unsigned short integer.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-down mode.
+*
+* \details Convert the half-precision floating point value \p h to an unsigned short
+* integer in round-down mode.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* \retval \p h converted to an unsigned short integer.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+*
+* \details Convert the half-precision floating point value \p h to an unsigned short
+* integer in round-up mode.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* \retval \p h converted to an unsigned short integer.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating
+* point value in round-to-nearest-even mode.
+* \req
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating
+* point value in round-towards-zero mode.
+* \req
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-down mode. +* \req +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-up mode. +* \req +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-down mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-up mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. 
+* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \req +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \req +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-down mode. +* \req +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-up mode. +* \req +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-towards-zero mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-down mode. +* \req +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. 
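+*
+* A minimal sketch of how the directed modes diverge on negative inputs,
+* assuming an exact half representation of -2.5:
+* \code
+* __half h = __float2half(-2.5f);
+* long long rd = __half2ll_rd(h); // -3: rounds towards negative infinity
+* long long rz = __half2ll_rz(h); // -2: rounds towards zero
+* \endcode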
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-up mode.
+*
+* \details Convert the half-precision floating point value \p h to a signed 64-bit
+* integer in round-up mode.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* \retval \p h converted to a signed 64-bit integer.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating
+* point value in round-to-nearest-even mode.
+* \req
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating
+* point value in round-towards-zero mode.
+* \req
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating
+* point value in round-down mode.
+* \req
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating
+* point value in round-up mode.
+* \req
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+*
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* \retval The truncated integer value.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half htrunc(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+*
+* \details Compute the smallest integer value not less than \p h.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* \retval The smallest integer value not less than \p h.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hceil(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* \retval The largest integer value which is less than or equal to \p h.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hfloor(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating point
+* number.
+*
+* \details Round \p h to the nearest integer value in half-precision floating point
+* format, with halfway cases rounded to the nearest even integer value.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* \retval The nearest integer to \p h.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hrint(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Truncate \p half2 vector input argument to the integral part.
+*
+* \details Round each component of vector \p h to the nearest integer value that does
+* not exceed \p h in magnitude.
+* \req
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The truncated \p h.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate \p half2 vector ceiling of the input argument.
+*
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \req
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector of smallest integers not less than \p h.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \req
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector of largest integers which are less than or equal to \p h.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating point
+* number.
+*
+* \details Round each component of \p half2 vector \p h to the nearest integer value in
+* half-precision floating point format, with halfway cases rounded to the
+* nearest even integer value.
+* \req
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector of rounded integer values.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns \p half2 with both halves equal to the input value.
+*
+* \details Returns \p half2 number with both halves equal to the input \p a \p half
+* number.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half2
+* \retval The vector which has both its halves equal to the input \p a.
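+*
+* A minimal usage sketch, broadcasting one scalar into both halves:
+* \code
+* __half s = __float2half(0.5f);
+* __half2 v = __half2half2(s); // low and high halves are both 0.5
+* \endcode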
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __half2half2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Swaps both halves of the \p half2 input.
+*
+* \details Swaps both halves of the \p half2 input and returns a new \p half2 number
+* with swapped halves.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval \p a with its halves being swapped.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number.
+*
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of
+* the return value, low 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The low 16 bits of \p a and of \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+*
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. High 16 bits from input \p a are stored in low 16 bits of
+* the return value, high 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The high 16 bits of \p a and of \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* \retval The high 16 bits of the input.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* \retval Returns \p half which contains low 16 bits of the input \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+*
+* \details Checks if the input \p half number \p a is infinite.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns int
+* \retval -1 iff \p a is equal to negative infinity,
+* \retval 1 iff \p a is equal to positive infinity,
+* \retval 0 otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+*
+* \details Combines two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \req
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half2
+* \retval The half2 with one half equal to \p a and the other to \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from \p half2 input.
+*
+* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with both halves equal to the low 16 bits of the input.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from \p half2 input.
+*
+* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with both halves equal to the high 16 bits of the input.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as a signed short integer.
+*
+* \details Reinterprets the bits in the half-precision floating point number \p h
+* as a signed short integer.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* \retval The reinterpreted value.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ short int __half_as_short(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as an unsigned short integer.
+*
+* \details Reinterprets the bits in the half-precision floating point number \p h
+* as an unsigned short integer.
+* \req
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* \retval The reinterpreted value.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a signed short integer as a \p half.
+*
+* \details Reinterprets the bits in the signed short integer \p i as a
+* half-precision floating point number.
+* \req
+* \param[in] i - short int. Is only being read.
+*
+* \returns half
+* \retval The reinterpreted value.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __short_as_half(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in an unsigned short integer as a \p half.
+*
+* \details Reinterprets the bits in the unsigned short integer \p i as a
+* half-precision floating point number.
+* \req
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* \retval The reinterpreted value.
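+*
+* A minimal sketch of a bit-level round trip (0x3C00 is the IEEE binary16
+* encoding of 1.0):
+* \code
+* __half one = __ushort_as_half((unsigned short)0x3C00U); // 1.0
+* unsigned short bits = __half_as_ushort(one);            // 0x3C00
+* \endcode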
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i);
+
+#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize 32
+#define __local_warpSize
+#endif
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(__half2 var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(__half2 var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half2 __shfl_down(__half2 var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(__half2 var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(__half var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(__half var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(__half var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(__half var, int delta, int width = warpSize);
+#endif
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the thread with ID delta modulo width (i.e.
+* within the same subsection). width must have a value which is a power of 2;
+* results are undefined if width is not a power of 2, or is a number greater than
+* warpSize.
+* \req
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2,
+* or is a number greater than warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding delta to the caller's thread ID.
+* The value of var held by the resulting thread ID is returned: this has the effect
+* of shifting var down the warp by delta threads. If width is less than warpSize then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of width and so the upper delta threads
+* will remain unchanged.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta:
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads is able to access elements from earlier groups of threads;
+* however, if a thread attempts to access elements from later groups of threads, its own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
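+*
+* A minimal sketch of the butterfly pattern this enables: a full-warp
+* elementwise sum over __half2 values, assuming all 32 lanes are active
+* and __hadd2 is available (compute capability 5.3 or higher):
+* \code
+* __half2 v = __float2half2_rn(1.0f); // per-thread value
+* for (int offset = 16; offset > 0; offset >>= 1) {
+*     v = __hadd2(v, __shfl_xor_sync(0xFFFFFFFFU, v, offset));
+* }
+* // every lane now holds the warp-wide sum (32.0, 32.0)
+* \endcode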
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the thread with ID delta modulo width (i.e.
+* within the same subsection). width must have a value which is a power of 2;
+* results are undefined if width is not a power of 2, or is a number greater than
+* warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2,
+* or is a number greater than warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding delta to the caller's thread ID.
+* The value of var held by the resulting thread ID is returned: this has the effect
+* of shifting var down the warp by delta threads. If width is less than warpSize then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of width and so the upper delta threads
+* will remain unchanged.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta:
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads is able to access elements from earlier groups of threads;
+* however, if a thread attempts to access elements from later groups of threads, its own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+*/
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width = warpSize);
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) */
+
+#if defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )
+__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr);
+__CUDA_FP16_DECL__ __half __ldg(const __half *ptr);
+__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr);
+__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr);
+__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr);
+__CUDA_FP16_DECL__ __half __ldca(const __half *ptr);
+__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr);
+__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr);
+#endif /*defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )*/
+
+#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of if-equal comparison of vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of not-equal comparison of vectors \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The \p half2 result of less-equal comparison of vectors \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of greater-equal comparison of vectors \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The half2 vector result of less-than comparison of vectors \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of greater-than comparison of vectors \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* \retval The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of unordered less-than comparison of vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
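+*
+* A minimal sketch of the ordered/unordered distinction on a NaN input:
+* \code
+* __half2 n = __half2half2(__float2half(nanf("")));
+* __half2 one = __float2half2_rn(1.0f);
+* __half2 o = __hgt2(n, one);  // (0.0, 0.0): ordered, NaN compares false
+* __half2 u = __hgtu2(n, one); // (1.0, 1.0): unordered, NaN compares true
+* \endcode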
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Determine whether \p half2 argument is a NaN.
+*
+* \details Determine whether each half of input \p half2 number \p a is a NaN.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with the corresponding \p half results set to
+* 1.0 for NaN, 0.0 otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest
+* mode.
+* \req DEEPLEARN-SRM_REQ-95
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The sum of vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \req DEEPLEARN-SRM_REQ-104
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The subtraction of vector \p b from \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \req DEEPLEARN-SRM_REQ-102
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The result of elementwise multiplying the vectors \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector division in round-to-nearest-even mode.
+*
+* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest
+* mode.
+* \req DEEPLEARN-SRM_REQ-103
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise division of \p a with \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval Returns \p a with the absolute value of both halves.
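+*
+* A minimal usage sketch:
+* \code
+* __half2 v = __floats2half2_rn(-1.5f, 2.0f);
+* __half2 a = __habs2(v); // (1.5, 2.0)
+* \endcode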
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The sum of \p a and \p b, with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The subtraction of vector \p b from \p a, with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The result of elementwise multiplication of vectors \p a and \p b,
+* with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \req DEEPLEARN-SRM_REQ-105
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode, with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the
+* results to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c,
+* with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Negates both halves of the input \p half2 number and returns the
+* result.
+*
+* \details Negates both halves of the input \p half2 number \p a and returns the result.
+* \req DEEPLEARN-SRM_REQ-101
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval Returns \p a with both halves negated.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Calculates the absolute value of input \p half number and returns the result.
+*
+* \details Calculates the absolute value of input \p half number and returns the result.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The absolute value of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __habs(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode.
+*
+* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \req DEEPLEARN-SRM_REQ-94
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \retval The sum of \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode.
+* \req DEEPLEARN-SRM_REQ-97
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \retval The result of subtracting \p b from \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \req DEEPLEARN-SRM_REQ-99
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \retval The result of multiplying \p a and \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half division in round-to-nearest-even mode.
+*
+* \details Divides \p half input \p a by input \p b in round-to-nearest-even
+* mode.
+* \req DEEPLEARN-SRM_REQ-98
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \retval The result of dividing \p a by \p b.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b);
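+/*
+ * [Editor's sketch, not part of the original header] The scalar __half
+ * operations above mirror their float counterparts; a hypothetical kernel
+ * (names are the editor's; requires sm_53 or newer):
+ *
+ *   // y[i] = a * x[i] + b, computed with two separate roundings.
+ *   __global__ void axpb(const __half *x, __half a, __half b,
+ *                        __half *y, int n)
+ *   {
+ *       int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *       if (i < n) {
+ *           y[i] = __hadd(__hmul(a, x[i]), b);
+ *       }
+ *   }
+ *
+ * Note this rounds after the multiply and again after the add; the fused
+ * __hfma declared below rounds only once.
+ */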
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \req
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \retval The sum of \p a and \p b, with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \req
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \retval The result of subtraction of \p b from \p a, with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \req
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \retval The result of multiplying \p a and \p b, with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \req DEEPLEARN-SRM_REQ-96
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* \retval The result of fused multiply-add operation on \p a, \p b, and \p c.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \req
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* \retval The result of fused multiply-add operation on \p a, \p b, and \p c,
+* with respect to saturation.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c);
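+/*
+ * [Editor's sketch, not part of the original header] __hfma computes
+ * a*b+c with a single rounding, and the _sat variant additionally clamps
+ * the result to [0.0, 1.0], which suits blending-style code. A hypothetical
+ * helper (the function name is the editor's):
+ *
+ *   // Linear interpolation r = x + t*(y - x), clamped to [0, 1].
+ *   __device__ __half lerp_sat(__half x, __half y, __half t)
+ *   {
+ *       return __hfma_sat(t, __hsub(y, x), x);
+ *   }
+ */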
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Negates input \p half number and returns the result.
+*
+* \details Negates input \p half number and returns the result.
+* \req DEEPLEARN-SRM_REQ-100
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval Minus \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half __hneg(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison, and returns boolean true
+* iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* \retval true, if both \p half results of if-equal comparison
+* of vectors \p a and \p b are true;
+* \retval false, otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison, and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* \retval true, if both \p half results of not-equal comparison
+* of vectors \p a and \p b are true;
+* \retval false, otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison, and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* \retval true, if both \p half results of less-equal comparison
+* of vectors \p a and \p b are true;
+* \retval false, otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b);
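+/*
+ * [Editor's sketch, not part of the original header] The __hbXX2 forms
+ * reduce the two per-half comparisons to a single bool, so they act as
+ * "all lanes satisfy the predicate" tests. A hypothetical helper:
+ *
+ *   // True only if both packed values are <= the packed limits.
+ *   __device__ bool both_within(__half2 v, __half2 limit)
+ *   {
+ *       return __hble2(v, limit);
+ *   }
+ *
+ * As documented above, these ordered forms return false whenever a NaN is
+ * present, while the unordered (*u2) variants treat NaN inputs as true.
+ */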
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison, and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* \retval true, if both \p half results of greater-equal
+* comparison of vectors \p a and \p b are true;
+* \retval false, otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison, and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* \retval true, if both \p half results of less-than comparison
+* of vectors \p a and \p b are true;
+* \retval false, otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison, and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* \retval true, if both \p half results of greater-than
+* comparison of vectors \p a and \p b are true;
+* \retval false, otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison, and returns
+* boolean true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \req
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* \retval true, if both \p half results of unordered if-equal
+* comparison of vectors \p a and \p b are true;
+* \retval false, otherwise.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison, and returns
+* boolean true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* \retval false, otherwise. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \req +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns bool +* \retval true, if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of if-equal comparison of \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of not-equal comparison of \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of less-equal comparison of \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of greater-equal comparison of \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of less-than comparison of \p a and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of greater-than comparison of \p a and \p b. 
+* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered less-than comparison of \p a and +* \p b. +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +*/ +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \req +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered greater-than comparison of \p a +* and \p b. 
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Determine whether \p half argument is a NaN.
+*
+* \details Determine whether \p half value \p a is a NaN.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns bool
+* \retval true iff argument is NaN.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ bool __hisnan(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half square root of input \p a in round-to-nearest-even mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The square root of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half reciprocal square root of input \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The reciprocal square root of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The reciprocal of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hrcp(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The natural logarithm of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hlog(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The binary logarithm of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hlog2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The decimal logarithm of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hlog10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half natural exponential function of input \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The natural exponential function on \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hexp(const __half a);
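+/*
+ * [Editor's sketch, not part of the original header] These math functions
+ * compose like their float counterparts; for example, a logistic sigmoid
+ * built from hexp, hrcp, and the arithmetic declared earlier (the helper
+ * name is the editor's):
+ *
+ *   // sigmoid(x) = 1 / (1 + exp(-x)), computed entirely in fp16.
+ *   __device__ __half sigmoid_h(__half x)
+ *   {
+ *       const __half one = __float2half(1.0f);
+ *       return hrcp(__hadd(one, hexp(__hneg(x))));
+ *   }
+ */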
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half binary exponential function of input \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The binary exponential function on \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hexp2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half decimal exponential function of input \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The decimal exponential function on \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hexp10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The cosine of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hcos(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half sine in round-to-nearest-even mode.
+*
+* \details Calculates \p half sine of input \p a in round-to-nearest-even mode.
+* \req
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* \retval The sine of \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half hsin(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 square root of input vector \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise square root on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise reciprocal square root on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise reciprocal on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a);
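+/*
+ * [Editor's sketch, not part of the original header] The h2* functions
+ * apply the operation to both halves independently, so they combine
+ * naturally with the packed arithmetic; a hypothetical helper:
+ *
+ *   // Per-lane 1/sqrt(x*x + y*y) for two (x, y) pairs packed lane-wise.
+ *   __device__ __half2 inv_norm2(__half2 x, __half2 y)
+ *   {
+ *       return h2rsqrt(__hadd2(__hmul2(x, x), __hmul2(y, y)));
+ *   }
+ */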
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise natural logarithm on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise binary logarithm on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise decimal logarithm on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise exponential function on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise binary exponential function on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 decimal exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise decimal exponential function on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even
+* mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise cosine on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector sine in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode.
+* \req
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The elementwise sine on vector \p a.
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+*/
+__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a);
+
+#endif /*if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/
+
+#if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)
+
+__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *address, __half2 val);
+
+#endif /*if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)*/
+
+#if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)
+
+__CUDA_FP16_DECL__ __half atomicAdd(__half *address, __half val);
+
+#endif /*if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)*/
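+/*
+ * [Editor's sketch, not part of the original header] The atomicAdd
+ * overloads above enable lock-free fp16 accumulation. Per the guards, the
+ * __half2 form requires compute capability 6.0+ and updates both halves as
+ * a single 32-bit atomic; the scalar __half form requires 7.0+. A
+ * hypothetical use (kernel name and parameters are the editor's):
+ *
+ *   // Accumulate per-bin packed sums; idx[i] selects a __half2 bin.
+ *   __global__ void accumulate(const __half2 *v, const int *idx,
+ *                              __half2 *bins, int n)
+ *   {
+ *       int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *       if (i < n) {
+ *           atomicAdd(&bins[idx[i]], v[i]);
+ *       }
+ *   }
+ */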
+
+#endif /* defined(__CUDACC__) */
+
+#undef __CUDA_FP16_DECL__
+#undef __CUDA_HOSTDEVICE_FP16_DECL__
+
+#endif /* defined(__cplusplus) */
+
+/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */
+#include "cuda_fp16.hpp"
+
+#endif /* end of include guard: __CUDA_FP16_H__ */
diff --git a/cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.hpp b/cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.hpp
new file mode 100755
index 0000000..a4403b5
--- /dev/null
+++ b/cupy/_core/include/cupy/_cuda/cuda-10.2/cuda_fp16.hpp
@@ -0,0 +1,2071 @@
+/*
+* Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee. Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users. These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_FP16_HPP__)
+#define __CUDA_FP16_HPP__
+
+/* C++11 header for std::move.
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if (__cplusplus >= 201103L) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
+#else /* !(__cplusplus >= 201103L)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* __cplusplus >= 201103L */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/**
+* Types which allow static initialization of "half" and "half2" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "half", and not a conversion from short->half.
+* Such a representation will be deprecated in a future version of CUDA.
+* (Note these are visible to non-nvcc compilers, including C-only compilation)
+*/
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+typedef struct __CUDA_ALIGN__(4) {
+    unsigned short x;
+    unsigned short y;
+} __half2_raw;
+
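+/*
+ * [Editor's sketch, not part of the original header] As the comment above
+ * notes, the raw types allow static initialization from a bit pattern.
+ * For example, 0x3C00 is the fp16 bit pattern of 1.0 (the same constant
+ * the increment/decrement operators later in this file use):
+ *
+ *   static const __half_raw kOneRaw = { 0x3C00 };            // half(1.0f) bits
+ *   static const __half2_raw kOnes2 = { 0x3C00, 0x3C00 };    // both halves 1.0
+ *
+ * In C++ translation units a __half_raw converts implicitly to __half via
+ * the constructor declared below, so "__half one = kOneRaw;" is valid.
+ */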
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+
+public:
+#if __cplusplus >= 201103L
+    __half() = default;
+#else
+    __CUDA_HOSTDEVICE__ __half() { }
+#endif /* __cplusplus >= 201103L */
+
+    /* Convert to/from __half_raw */
+    __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
+    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
+
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+
+    /* Construct from float/double */
+    __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
+    __CUDA_HOSTDEVICE__ __half(const double f) { __x = __float2half(static_cast<float>(f)).__x; }
+
+    __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __float2half(static_cast<float>(f)).__x; return *this; }
+
+/* Member functions only available to nvcc compilation so far */
+#if defined(__CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+    __device__ __half(short val) { __x = __short2half_rn(val).__x; }
+    __device__ __half(unsigned short val) { __x = __ushort2half_rn(val).__x; }
+    __device__ __half(int val) { __x = __int2half_rn(val).__x; }
+    __device__ __half(unsigned int val) { __x = __uint2half_rn(val).__x; }
+    __device__ __half(long long val) { __x = __ll2half_rn(val).__x; }
+    __device__ __half(unsigned long long val) { __x = __ull2half_rn(val).__x; }
+
+    /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
+    __device__ operator short() const { return
__half2short_rn(*this); } + __device__ __half &operator=(short val) { __x = __short2half_rn(val).__x; return *this; } + + __device__ operator unsigned short() const { return __half2ushort_rn(*this); } + __device__ __half &operator=(unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } + + __device__ operator int() const { return __half2int_rn(*this); } + __device__ __half &operator=(int val) { __x = __int2half_rn(val).__x; return *this; } + + __device__ operator unsigned int() const { return __half2uint_rn(*this); } + __device__ __half &operator=(unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } + + __device__ operator long long() const { return __half2ll_rn(*this); } + __device__ __half &operator=(long long val) { __x = __ll2half_rn(val).__x; return *this; } + + __device__ operator unsigned long long() const { return __half2ull_rn(*this); } + __device__ __half &operator=(unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __device__ operator bool() const { return (__x & 0x7FFF) != 0; } +#endif /* defined(__CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16 operations only supported on arch >= 5.3 */ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3C00 equating to half(1.0f), to avoid the extra conversion */ +__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00; h += one; return h; } +__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00; h -= one; return h; } +__device__ __forceinline__ __half operator++(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h += one; return ret; } +__device__ __forceinline__ __half operator--(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h -= one; return ret; } + +/* Unary plus and inverse operators */ +__device__ __forceinline__ __half operator+(const __half &h) { return h; } +__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); } +__device__ 
__forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* __half2 is visible to non-nvcc host compilers */ +struct __CUDA_ALIGN__(4) __half2 { + __half x; + __half y; + + // All construct/copy/assign/move +public: +#if __cplusplus >= 201103L + __half2() = default; + __CUDA_HOSTDEVICE__ __half2(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); } + __CUDA_HOSTDEVICE__ __half2 &operator=(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; } +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* __cplusplus >= 201103L */ + __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; } + + /* Convert to/from __half2_raw */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; } + __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; } +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */ +#if (__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_HALF2_OPERATORS__) + +__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } +__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } +__device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } +__device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } + +__device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; } + +__device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return h; } +__device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return h; } +__device__ __forceinline__ __half2 operator++(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return ret; } +__device__ __forceinline__ __half2 operator--(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y 
= 0x3C00; h = __hsub2(h, one); return ret; } + +__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; } +__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); } + +__device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbne2(lh, rh); } +__device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); } +__device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); } + +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + unsigned int u; + unsigned int result = 0U; +#if defined(__CUDACC__) + (void)memcpy(&x, &f, sizeof(f)); +#else + (void)std::memcpy(&x, &f, sizeof(f)); +#endif + u = (x & 0x7fffffffU); + sign = ((x >> 16U) & 0x8000U); + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + remainder = 0U; + result = ((u == 0x7f800000U) ? 
(sign | 0x7c00U) : 0x7fffU);
+    } else if (u > 0x477fefffU) { // Overflows
+        remainder = 0x80000000U;
+        result = (sign | 0x7bffU);
+    } else if (u >= 0x38800000U) { // Normal numbers
+        remainder = u << 19U;
+        u -= 0x38000000U;
+        result = (sign | (u >> 13U));
+    } else if (u < 0x33000001U) { // +0/-0
+        remainder = u;
+        result = sign;
+    } else { // Denormal numbers
+        const unsigned int exponent = u >> 23U;
+        const unsigned int shift = 0x7eU - exponent;
+        unsigned int mantissa = (u & 0x7fffffU);
+        mantissa |= 0x800000U;
+        remainder = mantissa << (32U - shift);
+        result = (sign | (mantissa >> shift));
+    }
+    return static_cast<unsigned short>(result);
+}
+#endif /* #if !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low;\n"
+        " cvt.rn.f16.f32 low, %1;\n"
+        " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(a));
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " cvt.rn.f16.f32 low, %1;\n"
+        " cvt.rn.f16.f32 high, %2;\n"
+        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(b));
+#endif
+    return val;
+}
+
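+/* [Editorial note: illustrative sketch, not part of the upstream NVIDIA header.]
+ * The __float2half* family above selects its IEEE-754 rounding mode by suffix:
+ * _rn = round-to-nearest-even, _rz = round-towards-zero, _rd = round-down,
+ * _ru = round-up; plain __float2half rounds to nearest even, like
+ * __float2half_rn. A minimal host-side round trip, assuming this header is
+ * reached through <cuda_fp16.h>, is kept inside this comment so the header
+ * itself is left unchanged:
+ *
+ *   #include <cuda_fp16.h>
+ *   #include <cstdio>
+ *
+ *   int main() {
+ *       __half h = __float2half_rn(0.1f); // 0.1f is not exactly representable in fp16
+ *       float back = __half2float(h);     // widening back to float is exact
+ *       std::printf("%.10f\n", back);     // expected: 0.0999755859 (half bits 0x2E66)
+ *       return 0;
+ *   }
+ */
+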
+#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
+static float __internal_half2float(const unsigned short h)
+{
+    unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
+    unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
+    unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
+    float f;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        sign = ((mantissa != 0U) ? 0U : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U;
+    }
+    unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+    return f;
+}
+#endif /* !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
+#else
+    val = __internal_half2float(static_cast<__half_raw>(a).x);
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+    val = __internal_half2float(static_cast<__half2_raw>(a).x);
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+    val = __internal_half2float(static_cast<__half2_raw>(a).y);
+#endif
+    return val;
+}
+
+/* Intrinsic functions only available to nvcc compilers */
+#if defined(__CUDACC__)
+
+/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
+__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(__half x, __half y)
+{
+    __half2 t; t.x = x; t.y = y; return t;
+}
+#undef __VECTOR_FUNCTIONS_DECL__
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 f)
+{
+    __half2 val = __floats2half2_rn(f.x, f.y);
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 l)
+{
+    float hi_float;
+    float lo_float;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(l)));
+
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(l)));
+#else
+    lo_float = __internal_half2float(((__half2_raw)l).x);
+    hi_float = __internal_half2float(((__half2_raw)l).y);
+#endif
+    return make_float2(lo_float, hi_float);
+}
+__CUDA_FP16_DECL__ int __half2int_rn(__half h)
+{
+    int i;
+    asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_rz(__half h)
+{
+    int i;
+    asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_rd(__half h)
+{
+    int i;
+    asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_ru(__half h)
+{
+    int i;
+    asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
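+/* [Editorial note: descriptive comment, not part of the upstream NVIDIA header.]
+ * The __half2int_* functions above and the __int2half_* family that follows
+ * are declared __CUDA_FP16_DECL__, i.e. device-only: each body is a single
+ * PTX cvt instruction whose rounding modifier mirrors the function suffix
+ * (e.g. cvt.rni/.rzi/.rmi/.rpi for __half2int_rn/_rz/_rd/_ru). Host code can
+ * convert through float (e.g. __half2float) instead. */
+__CUDA_FP16_DECL__ __half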
__int2half_rn(int i) +{ + __half h; + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rz(int i) +{ + __half h; + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rd(int i) +{ + __half h; + asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_ru(int i) +{ + __half h; + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ short int __half2short_rn(__half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rz(__half h) +{ + short int i; + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(__half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(__half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __short2half_rn(short int i) +{ + __half h; + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rz(short int i) +{ + __half h; + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rd(short int i) +{ + __half h; + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_ru(short int i) +{ + __half h; + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h) +{ + unsigned int i; + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i) +{ + __half h; + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i) +{ + __half h; + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i) +{ + __half h; + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i) +{ + __half h; + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h) +{ + unsigned short int i; + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : 
"h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i) +{ + __half h; + asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i) +{ + __half h; + asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i) +{ + __half h; + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i) +{ + __half h; + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h) +{ + unsigned long long int i; + asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h) +{ + unsigned long long int i; + asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h) +{ + unsigned long long int i; + asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i) +{ + __half h; + asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i) +{ + __half h; + asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i) +{ + __half h; + asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i) +{ + __half h; + asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h) +{ + long long int i; + asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h) +{ + long long int i; + asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h) +{ + long long int i; + asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h) +{ + long long int i; + asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i) +{ + __half h; + asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i) +{ + __half h; + asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i) +{ + __half h; + asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i) +{ + __half h; + asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ __half htrunc(const __half h) +{ + 
__half r; + asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hceil(const __half h) +{ + __half r; + asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hfloor(const __half h) +{ + __half r; + asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hrint(const __half h) +{ + __half r; + asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} + +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rzi.f16.f16 low, low;\n" + " cvt.rzi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rpi.f16.f16 low, low;\n" + " cvt.rpi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rmi.f16.f16 low, low;\n" + " cvt.rmi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 l, const __half2 h) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 l, const __half2 h) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half __low2half(const __half2 h) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h))); + return ret; +} +__CUDA_FP16_DECL__ int __hisinf(const __half a) +{ + if (__HALF_TO_CUS(a) == 0xFC00) { + return -1; + } + if (__HALF_TO_CUS(a) == 0x7C00) { + return 1; + } + return 0; +} +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 l) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 l) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ __half __high2half(const __half2 h) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : 
"r"(__HALF2_TO_CUI(h))); + return ret; +} +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half l, const __half h) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(l)), "h"(__HALF_TO_CUS(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __half2half2(const __half lh) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(lh))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 lh) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(lh))); + return val; +} +__CUDA_FP16_DECL__ short int __half_as_short(const __half h) +{ + return (short int)__HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ + return __HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ __half __short_as_half(const short int i) +{ + __half h; + __HALF_TO_US(h) = (unsigned short int)i; + return h; +} +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ + __half h; + __HALF_TO_US(h) = i; + return h; +} + +#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) do {\ + __half2 r; \ + asm volatile ("{"#name" %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} while(0) + +#define __SHUFFLE_SYNC_HALF2_MACRO(name) do {\ + __half2 r; \ + asm("{"#name" %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} while(0) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half2 __shfl(__half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_up(__half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = (warpSize - width) << 8; + __SHUFFLE_HALF2_MACRO(shfl.up.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_down(__half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.down.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_xor(__half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32); +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = (warpSize - width) << 8; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, 
WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32); +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half __shfl(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2half(temp2); +} + +#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)*/ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : 
"=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +#undef __LDG_PTR +#endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} while(0) +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.eq); +} +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ne); +} +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.le); +} +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ge); +} +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.lt); +} +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gt); +} +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.equ); +} +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.neu); +} +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.leu); +} +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.geu); +} +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ltu); +} +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gtu); +} +#undef __COMPARISON_OP_HALF2_MACRO +#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + if (__HALF2_TO_CUI(val) == 0x3C003C00) \ + return true; \ + else \ + return false; \ +} while(0) +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq); +} +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne); +} +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.le); +} +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge); +} +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + 
__BOOL_COMPARISON_OP_HALF2_MACRO(set.lt); +} +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt); +} +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ); +} +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu); +} +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu); +} +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu); +} +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu); +} +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu); +} +#undef __BOOL_COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) do {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp."#name".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return val ? true : false; \ +} while(0) +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq); +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne); +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le); +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge); +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt); +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt); +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ); +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu); +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu); +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu); +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu); +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu); +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +#define __BINARY_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{"#name".f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} while(0) + +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add); +} +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub); +} +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul); +} +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.sat); +} +__CUDA_FP16_DECL__ __half2 
__hsub2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.sat); +} +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.sat); +} +#undef __BINARY_OP_HALF2_MACRO +#define __TERNARY_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{"#name".f16x2 %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \ + return val; \ +} while(0) +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn); +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat); +} +#undef __TERNARY_OP_HALF2_MACRO +__CUDA_FP16_DECL__ __half2 __h2div(__half2 a, __half2 b) { + __half ha, hb; + + ha = __low2half(a); + hb = __low2half(b); + + __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +#define __BINARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} while(0) +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add); +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub); +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul); +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat); +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat); +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat); +} +#undef __BINARY_OP_HALF_MACRO +#define __TERNARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} while(0) +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn); +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat); +} +#undef __TERNARY_OP_HALF2_MACRO +__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) { + __half v, abs, den; + __HALF_TO_US(den) = 0x008F; + float fa, fb, fv, rcp; + + fa = __half2float(a); + fb = __half2float(b); + + asm("{rcp.approx.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) { + float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc,"#spc";\n"\ + " mov.b32 ulp,"#ulp";\n"\ + " set.eq.f16x2.f16x2 p,"#i", spc;\n"\ + " fma.rn.f16x2 
"#r",p,ulp,"#r";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc,"#spc";\n"\ + " mov.b16 ulp,"#ulp";\n"\ + " set.eq.f16.f16 p,"#i", spc;\n"\ + " fma.rn.f16 "#r",p,ulp,"#r";\n}\n" +#define __APPROX_FCAST(fun) do {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " "#fun".approx.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} while(0) +#define __APPROX_FCAST2(fun) do {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " "#fun".approx.f32 fl, fl; \n"\ + " "#fun".approx.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} while(0) +static __device__ __forceinline__ float __float_simpl_sinf(float); +static __device__ __forceinline__ float __float_simpl_cosf(float); +__CUDA_FP16_DECL__ __half __hsin_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_sinf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hsin(const __half a) { + __half r = __hsin_internal(a); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " mov.b16 t, 0x8000; \n\t" + " and.b16 t,r,t; \n\t" + __SPEC_CASE(i, r, 0X32B3, 0x0800) + __SPEC_CASE(i, r, 0X5CB0, 0x1000) + __SPEC_CASE(i, r, 0XB2B3, 0x8800) + __SPEC_CASE(i, r, 0XDCB0, 0x9000) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = __halves2half2(__hsin_internal(l), __hsin_internal(h)); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3, 0x08000800) + __SPEC_CASE2(i, r, 0X5CB05CB0, 0x10001000) + __SPEC_CASE2(i, r, 0XB2B3B2B3, 0x88008800) + __SPEC_CASE2(i, r, 0XDCB0DCB0, 0x90009000) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hcos_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_cosf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + __half r = __hcos_internal(a); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + __SPEC_CASE(i, r, 0X2B7C, 0x1000) + __SPEC_CASE(i, r, 0XAB7C, 0x1000) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = __halves2half2(__hcos_internal(l), __hcos_internal(h)); + asm("{\n\t" + " .reg.b32 i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7C, 0x10001000) + __SPEC_CASE2(i, r, 0XAB7CAB7C, 0x10001000) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(float a, int *quadrant) +{ + float j, t; + int q; + q = __float2int_rn(a * 0.636619772F); + j = (float)q; + t = __fmaf_rn(-j, 1.5707962512969971e+000F, a); + t = 
__fmaf_rn(-j, 7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, int i) +{ + float x2, z; + x2 = x*x; + + if (i & 1) { + z = 2.44331571e-5F; + z = __fmaf_rn(z, x2, -1.38873163e-3F); + } + else { + z = -1.95152959e-4F; + z = __fmaf_rn(z, x2, 8.33216087e-3F); + } + if (i & 1) { + z = __fmaf_rn(z, x2, 4.16666457e-2F); + z = __fmaf_rn(z, x2, -5.00000000e-1F); + } + else { + z = __fmaf_rn(z, x2, -1.66666546e-1F); + z = __fmaf_rn(z, x2, 0.0F); + } + x = __fmaf_rn(z, x, x); + if (i & 1) { + x = __fmaf_rn(z, x2, 1.0F); + } + if (i & 2) { + x = __fmaf_rn(x, -1.0F, 0.0F); + } + return x; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + int i; + if (::isinf(a)) { + a = a * 0.0F; + } + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + int i; + if (::isinf(a)) { + a = a * 0.0F; + } + a = __internal_trig_reduction_kernel(a, &i); + i++; + z = __internal_sin_cos_kernel(a, i); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79, 0x9400) + __SPEC_CASE(h, r, 0X25CF, 0x9400) + __SPEC_CASE(h, r, 0XC13B, 0x0400) + __SPEC_CASE(h, r, 0XC1EF, 0x0200) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79, 0x94009400) + __SPEC_CASE2(h, r, 0X25CF25CF, 0x94009400) + __SPEC_CASE2(h, r, 0XC13BC13B, 0x04000400) + __SPEC_CASE2(h, r, 0XC1EFC1EF, 0x02000200) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.f32 f,f; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 f,f,C; \n" + " 
ex2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DE, 0x9800) + __SPEC_CASE(h, r, 0x9766, 0x9000) + __SPEC_CASE(h, r, 0x9972, 0x1000) + __SPEC_CASE(h, r, 0xA5C4, 0x1000) + __SPEC_CASE(h, r, 0xBF0A, 0x8100) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DE, 0x98009800) + __SPEC_CASE2(h, r, 0x97669766, 0x90009000) + __SPEC_CASE2(h, r, 0x99729972, 0x10001000) + __SPEC_CASE2(h, r, 0xA5C4A5C4, 0x10001000) + __SPEC_CASE2(h, r, 0xBF0ABF0A, 0x81008100) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2, 0x8080) + __SPEC_CASE(r, r, 0xBF46, 0x9400) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2, 0x80808080) + __SPEC_CASE2(r, r, 0xBF46BF46, 0x94009400) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160D, 0x9C00) + __SPEC_CASE(h, r, 0X3BFE, 0x8010) + __SPEC_CASE(h, r, 0X3C0B, 0x8080) + __SPEC_CASE(h, r, 0X6051, 0x1C00) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160D, 0x9C009C00) + __SPEC_CASE2(h, r, 0X3BFE3BFE, 0x80108010) + __SPEC_CASE2(h, r, 0X3C0B3C0B, 0x80808080) + __SPEC_CASE2(h, r, 0X60516051, 0x1C001C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " mov.b32 C, 0x3E9A209B; 
\n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338F, 0x1000) + __SPEC_CASE(h, r, 0x33F8, 0x9000) + __SPEC_CASE(h, r, 0x57E1, 0x9800) + __SPEC_CASE(h, r, 0x719D, 0x9C00) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209B; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338F, 0x10001000) + __SPEC_CASE2(h, r, 0x33F833F8, 0x90009000) + __SPEC_CASE2(h, r, 0x57E157E1, 0x98009800) + __SPEC_CASE2(h, r, 0x719D719D, 0x9C009C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp); +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp); +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt); +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt); +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt); +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt); +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ bool __hisnan(const __half a) +{ + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +} +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 zero = __float2half2_rn(0.0); + return __hsub2(zero, a); +} +__CUDA_FP16_DECL__ __half __hneg(const __half a) +{ + __half zero; + zero = __float2half(0.0); + return __hsub(zero, a); +} +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __habs(const __half a) +{ + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +#endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *address, __half2 val) { + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +__CUDA_FP16_DECL__ __half atomicAdd(__half *address, __half val) { + __half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : 
"=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/ + +#undef __PTR + +#undef __CUDA_FP16_DECL__ +#endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +typedef __half half; +typedef __half2 half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.h b/cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.h new file mode 100755 index 0000000..4729425 --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.h @@ -0,0 +1,3612 @@ +/* +* Copyright 1993-2020 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. 
+*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions that are +* only supported in device code. +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion And Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +#if defined(__cplusplus) +#if defined(__CUDACC__) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__CUDACC__) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ + +/** + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * + * \internal + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. + * \endinternal + */ +struct __half; + +/** + * \brief half2 datatype + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * + * \internal + * \req Vectorified version of half. + * \endinternal + */ +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns half +* \retval \p a converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read.
+* \returns half
+* \retval \p a converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read.
+* \returns half
+* \retval \p a converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-towards-zero mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-towards-zero mode.
+* \param[in] a - float. Is only being read.
+* \returns half
+* \retval \p a converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-down mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-down mode.
+* \param[in] a - float. Is only being read.
+*
+* \returns half
+* \retval \p a converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-up mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-up mode.
+* \param[in] a - float. Is only being read.
+*
+* \returns half
+* \retval \p a converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts \p half number to float.
+*
+* \details Converts half number \p a to float.
+* \param[in] a - half. Is only being read.
+*
+* \returns float
+* \retval \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts input to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+*
+* \details Converts input \p a to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+* \param[in] a - float.
Is only being read. +* +* \returns half2 +* \retval The \p half2 value with both halves equal to the converted half +* precision number. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* \retval The \p half2 value with corresponding halves equal to the +* converted input floats. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* \retval The low 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* \retval The high 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); + +#if defined(__CUDACC__) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of float2 to half precision in round-to-nearest +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* \retval The \p half2 which has corresponding halves equal to the +* converted float2 components. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* \retval \p a converted to float2. 
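+*
+* A minimal device-side sketch of the widen/narrow round trip (the kernel
+* and buffer names here are hypothetical, not part of this header):
+* \code
+* __global__ void scale_half2(__half2 *data, int n)
+* {
+*     int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n) {
+*         float2 f = __half22float2(data[i]);   // widen both packed halves
+*         f.x *= 2.0f;
+*         f.y *= 2.0f;
+*         data[i] = __float22half2_rn(f);       // narrow back, round to nearest even
+*     }
+* }
+* \endcode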
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rn(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rz(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. 
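+*
+* A short sketch of how the four rounding modes differ (hypothetical values;
+* 2049 lies exactly halfway between the representable half values 2048 and
+* 2050):
+* \code
+* // Inside a __global__ or __device__ function:
+* __half a = __int2half_rd(2049);  // 2048, rounded down
+* __half b = __int2half_ru(2049);  // 2050, rounded up
+* __half c = __int2half_rz(2049);  // 2048, rounded towards zero
+* __half d = __int2half_rn(2049);  // 2048, the even neighbour of the tie
+* \endcode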
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rd(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_ru(int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rn(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \param[in] i - short int. 
Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rz(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rd(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_ru(short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. 
+* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-up mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned short +* integer in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned short +* integer in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned short +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval \p h converted to an unsigned short integer. 
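+*
+* A device-side sketch of the directed roundings (hypothetical values;
+* 1.7f first narrows to the nearest representable half):
+* \code
+* __half h = __float2half(1.7f);
+* unsigned short lo = __half2ushort_rd(h);  // 1, rounded down
+* unsigned short hi = __half2ushort_ru(h);  // 2, rounded up
+* \endcode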
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+*
+* \details Convert the half-precision floating point value \p h to an unsigned short
+* integer in round-up mode.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* \retval \p h converted to an unsigned short integer.
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating
+* point value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating
+* point value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-down mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating
+* point value in round-down mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-up mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating
+* point value in round-up mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* \retval \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating point value \p h to an unsigned 64-bit
+* integer in round-to-nearest-even mode.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned long long int
+* \retval \p h converted to an unsigned 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating point value \p h to an unsigned 64-bit
+* integer in round-towards-zero mode.
+* \param[in] h - half. Is only being read.
+* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. 
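+*
+* A device-side sketch (hypothetical value): 64-bit integers above 2048 are
+* generally not exactly representable in half precision, so the rounding
+* mode picks the neighbour:
+* \code
+* unsigned long long v = 4097ULL;  // between the half values 4096 and 4100
+* __half down = __ull2half_rd(v);  // 4096
+* __half up   = __ull2half_ru(v);  // 4100
+* \endcode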
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +*/ +__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-down mode. +* \param[in] i - long long int. 
Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The truncated integer value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htrunc(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The smallest integer value not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The largest integer value which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating point +* number. +* +* \details Round \p h to the nearest integer value in half-precision floating point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* \retval The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. 
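+*
+* A device-side sketch of the \p half2 rounding family (hypothetical values):
+* \code
+* __half2 v = __floats2half2_rn(1.3f, -1.3f);
+* __half2 c = h2ceil(v);    // ( 2.0, -1.0)
+* __half2 f = h2floor(v);   // ( 1.0, -2.0)
+* __half2 t = h2trunc(v);   // ( 1.0, -1.0)
+* \endcode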
+*
+* \returns half2
+* \retval The vector of smallest integers not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector of largest integers which are less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating point
+* number.
+*
+* \details Round each component of \p half2 vector \p h to the nearest integer value in
+* half-precision floating point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector of rounded integer values.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns \p half2 with both halves equal to the input value.
+*
+* \details Returns \p half2 number with both halves equal to the input \p a \p half
+* number.
+* \param[in] a - half. Is only being read.
+*
+* \returns half2
+* \retval The vector which has both its halves equal to the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __half2half2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Swaps both halves of the \p half2 input.
+*
+* \details Swaps both halves of the \p half2 input and returns a new \p half2 number
+* with swapped halves.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval \p a with its halves being swapped.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number.
+*
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of
+* the return value, low 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The low 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+*
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. High 16 bits from input \p a are stored in low 16 bits of
+* the return value, high 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The high 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* \retval The high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* \retval Returns \p half which contains low 16 bits of the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+*
+* \details Checks if the input \p half number \p a is infinite.
+* \param[in] a - half. Is only being read.
+*
+* \returns int
+* \retval -1 iff \p a is equal to negative infinity,
+* \retval 1 iff \p a is equal to positive infinity,
+* \retval 0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+*
+* \details Combines two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half2
+* \retval The half2 with one half equal to \p a and the other to \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from \p half2 input.
+*
+* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with both halves equal to the low 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from \p half2 input.
+*
+* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with both halves equal to the high 16 bits of the input.
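+*
+* A device-side sketch of the lane helpers declared around here
+* (hypothetical values):
+* \code
+* __half2 v   = __halves2half2(__float2half(1.0f), __float2half(2.0f));
+* __half  lo  = __low2half(v);          // 1.0, the low 16 bits
+* __half  hi  = __high2half(v);         // 2.0, the high 16 bits
+* __half2 rev = __lowhigh2highlow(v);   // (2.0, 1.0), halves swapped
+* __half2 hi2 = __high2half2(v);        // (2.0, 2.0), high half broadcast
+* \endcode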
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* \details Reinterprets the bits in the half-precision floating point number \p h +* as a signed short integer. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. +* +* \details Reinterprets the bits in the half-precision floating point \p h +* as an unsigned short number. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* half-precision floating point number. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); + +#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." 
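+/*
+* The non-sync shuffle variants declared below are only visible for
+* __CUDA_ARCH__ < 700 and are deprecated in favor of the _sync forms. A
+* minimal sketch of the change at a hypothetical call site, where the
+* 0xffffffffU mask assumes all 32 lanes of the warp participate:
+*
+*     h2 = __shfl_down(h2, 1);                     // deprecated
+*     h2 = __shfl_down_sync(0xffffffffU, h2, 1);   // replacement
+*/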
+
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(__half2 var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(__half2 var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half2 __shfl_down(__half2 var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(__half2 var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(__half var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(__half var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(__half var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(__half var, int delta, int width = warpSize);
+#endif
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the delta modulo width (i.e.
+* within the same subsection). width must have a value which is a power of 2;
+* results are undefined if width is not a power of 2, or is a number greater than
+* warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2,
+* or is a number greater than warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding delta to the caller's thread ID.
+* The value of var held by the resulting thread ID is returned: this has the effect
+* of shifting var down the warp by delta threads. If width is less than warpSize then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of width and so the upper delta threads
+* will remain unchanged.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta:
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads are able to access elements from earlier groups of threads,
+* however if they attempt to access elements from later groups of threads their own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the delta modulo width (i.e.
+* within the same subsection).
width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
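+*
+* The shift-down pattern is the building block of warp reductions. A minimal
+* warp-sum sketch (hypothetical helper; assumes a full 32-lane warp and
+* __CUDA_ARCH__ >= 530 for __hadd):
+* \code
+* __device__ __half warp_sum(__half v)
+* {
+*     for (int offset = 16; offset > 0; offset /= 2)
+*         v = __hadd(v, __shfl_down_sync(0xffffffffU, v, offset));
+*     return v;  // lane 0 now holds the sum over the warp
+* }
+* \endcode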
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta:
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads are able to access elements from earlier groups of threads,
+* however if they attempt to access elements from later groups of threads their own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width = warpSize);
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) */
+
+#if defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldg(const __half *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldca(const __half *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldlu(const __half *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcv(const __half *ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half2 *ptr, __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half *ptr, __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half2 *ptr, __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half *ptr, __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half2 *ptr, __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half *ptr, __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half2 *ptr, __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half *ptr, __half value);
+#endif /*defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )*/
+
+#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
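+*
+* Because the result lanes are numeric 1.0/0.0, they can serve directly as
+* multiplicative masks. A device-side sketch (hypothetical values; __hmul2
+* is the \p half2 multiply declared elsewhere in this header):
+* \code
+* __half2 mask = __heq2(a, b);      // 1.0 where a == b, 0.0 elsewhere
+* __half2 kept = __hmul2(mask, a);  // zero out the unequal lanes
+* \endcode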
+* +* \returns half2 +* \retval The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The \p half2 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The half2 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of greater-than comparison of vectors \p a and \p b. 
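+* \par Example
+* An illustrative sketch, not part of the original header: an
+* elementwise maximum built from the greater-than mask, usable where
+* __hmax2 is unavailable. The function name is hypothetical, and NaN
+* lanes fall through to \p b because the mask is 0.0 there.
+* \code
+* __device__ __half2 max2_via_mask(__half2 a, __half2 b)
+* {
+*     __half2 m = __hgt2(a, b);                  // 1.0 where a > b
+*     // a*m + b*(1-m): selects a where a > b, b elsewhere
+*     __half2 one = __float2half2_rn(1.0f);
+*     return __hfma2(a, m, __hmul2(b, __hsub2(one, m)));
+* }
+* \endcode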
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of unordered less-than comparison of vectors \p a and \p b. 
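+* \par Example
+* An illustrative sketch, not part of the original header: the ordered
+* and unordered variants differ only on NaN inputs, so comparing a value
+* with itself under both detects NaN lanes, mirroring __hisnan2. The
+* function name is hypothetical.
+* \code
+* __device__ __half2 nan_lanes(__half2 a)
+* {
+*     // a != a is false per lane for the ordered form, and true only
+*     // on NaN lanes for the unordered form, so the difference is the
+*     // NaN mask: 1.0 on NaN lanes, 0.0 elsewhere.
+*     return __hsub2(__hneu2(a, a), __hne2(a, a));
+* }
+* \endcode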
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Determine whether \p half2 argument is a NaN.
+*
+* \details Determine whether each half of input \p half2 number \p a is a NaN.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with the corresponding \p half results set to
+* 1.0 for NaN, 0.0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector division in round-to-nearest-even mode.
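+* \par Example
+* An illustrative sketch, not part of the original header: per-lane
+* scaling of a packed pair by a packed pair of denominators. The
+* function name is hypothetical.
+* \code
+* __device__ __half2 normalize2(__half2 v, __half2 sums)
+* {
+*     return __h2div(v, sums);   // { v.x / sums.x, v.y / sums.y }
+* }
+* \endcode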
+* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval Returns \p a with the absolute value of both halves. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. 
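+* \par Example
+* An illustrative sketch, not part of the original header: one step of
+* a dot-product accumulation over packed half pairs. The fused form
+* rounds once, unlike a separate __hmul2 followed by __hadd2. The
+* function name is hypothetical.
+* \code
+* __device__ __half2 dot_step(__half2 x, __half2 w, __half2 acc)
+* {
+*     return __hfma2(x, w, acc);   // acc + x*w with a single rounding
+* }
+* \endcode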
+* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval Returns \p a with both halves negated. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The absolute value of a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. 
+* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of multiplying \p a and \p b, with respect to saturation. 
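+* \par Example
+* An illustrative sketch, not part of the original header: the _sat
+* forms clamp to [0.0, 1.0], which suits blending of values assumed to
+* be normalized. The function name is hypothetical.
+* \code
+* __device__ __half blend(__half a, __half b, __half t)
+* {
+*     // a*(1-t) + b*t, with each product and the sum clamped to [0, 1];
+*     // assumes a, b, and t are already in [0, 1]
+*     __half one = __float2half(1.0f);
+*     return __hadd_sat(__hmul_sat(a, __hsub(one, t)), __hmul_sat(b, t));
+* }
+* \endcode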
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* \retval The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* \retval The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* \details Negates input \p half number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison, and returns boolean true +* iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of if-equal comparison +* of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, +* \retval false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of not-equal comparison of \p a and \p b. 
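+* \par Example
+* An illustrative sketch, not part of the original header: the scalar
+* comparisons in this family return bool and can drive ordinary control
+* flow. The function name is hypothetical.
+* \code
+* __device__ __half relu(__half x)
+* {
+*     __half zero = __float2half(0.0f);
+*     // __hgt is false for NaN input, so NaN maps to zero here
+*     return __hgt(x, zero) ? x : zero;
+* }
+* \endcode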
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered not-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* \details Determine whether \p half value \p a is a NaN. +* \param[in] a - half. Is only being read. +* +* \returns bool +* \retval true iff argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hisnan(const __half a); +#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
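+* \par Example
+* An illustrative sketch, not part of the original header: clamping
+* with the __hmax/__hmin declared in this block. With these (non-_nan)
+* forms a NaN input falls through to the other operand, so clamping a
+* NaN yields the bound. The function name is hypothetical.
+* \code
+* __device__ __half clamp_h(__half x, __half lo, __half hi)
+* {
+*     return __hmin(__hmax(x, lo), hi);
+* }
+* \endcode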
+* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values, NaNs pass through. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values, NaNs pass through. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* \retval The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs. +* +* \details Calculates \p half2 vector max(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* \retval The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs. +* +* \details Calculates \p half2 vector min(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector max(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector min(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. 
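+* \par Example
+* An illustrative sketch, not part of the original header: a fused
+* multiply, bias-add, and ReLU step of the kind found in the inner loop
+* of a half-precision dense layer. The function name is hypothetical.
+* \code
+* __device__ __half2 dense_step(__half2 x, __half2 w, __half2 bias)
+* {
+*     // max(x*w + bias, 0) per lane in one intrinsic;
+*     // NaN results become canonical NaN rather than 0
+*     return __hfma2_relu(x, w, bias);
+* }
+* \endcode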
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); +#endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/ +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input \p a in round-to-nearest +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The reciprocal of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The natural exponential function on \p a. 
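+* \par Example
+* An illustrative sketch, not part of the original header: a
+* numerically naive logistic function built from the half math
+* functions declared here. The function name is hypothetical.
+* \code
+* __device__ __half sigmoid_h(__half x)
+* {
+*     __half one = __float2half(1.0f);
+*     return hrcp(__hadd(one, hexp(__hneg(x))));   // 1 / (1 + e^-x)
+* }
+* \endcode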
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half binary exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest +* mode. +* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise reciprocal on vector \p a. 
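+* \par Example
+* An illustrative sketch, not part of the original header: the h2*
+* forms process both packed halves at once, so loops over __half data
+* are often reshaped to step over __half2. The kernel name and the
+* parameter n2 (the element count in units of __half2) are hypothetical.
+* \code
+* __global__ void recip_all(const __half2 *in, __half2 *out, int n2)
+* {
+*     int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n2) out[i] = h2rcp(in[i]);   // two reciprocals per thread
+* }
+* \endcode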
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise decimal logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise decimal exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. 
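+* \par Example
+* An illustrative sketch, not part of the original header: a packed
+* sine/cosine evaluation over a pair of phases. The function name is
+* hypothetical.
+* \code
+* __device__ void sincos2(__half2 phase, __half2 *s, __half2 *c)
+* {
+*     *s = h2sin(phase);
+*     *c = h2cos(phase);
+* }
+* \endcode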
+* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); + +#endif /*if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ + +#if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__) + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *address, __half2 val); + +#endif /*if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)*/ + +#if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__) + +__CUDA_FP16_DECL__ __half atomicAdd(__half *address, __half val); + +#endif /*if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)*/ + +#endif /* defined(__CUDACC__) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ + +#endif /* defined(__cplusplus) */ + +/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */ +#include "cuda_fp16.hpp" + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.hpp b/cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.hpp new file mode 100755 index 0000000..aeaa36b --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-11.0/cuda_fp16.hpp @@ -0,0 +1,2285 @@ +/* +* Copyright 1993-2020 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users. These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_FP16_HPP__)
+#define __CUDA_FP16_HPP__
+
+#if !defined(__CUDA_FP16_H__)
+#error "Do not include this file directly. Instead, include cuda_fp16.h."
+#endif
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+# define __CPP_VERSION_AT_LEAST_11_FP16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+# define __CPP_VERSION_AT_LEAST_11_FP16
+#endif
+
+/* C++11 header for std::move.
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Macros for half & half2 binary arithmetic */
+#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{"#name".f16 %0,%1,%2;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{"#name".f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{"#name".f16 %0,%1,%2,%3;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{"#name".f16x2 %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
+   return val; \
+} /* while(0) */
+
+/**
+* Types which allow static initialization of "half" and "half2" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "half", and not a conversion from short->half.
+* Such a representation will be deprecated in a future version of CUDA.
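+* As an illustrative example (an editorial note, not NVIDIA's original
+* text): the bit pattern 0x3C00 encodes half-precision 1.0 (sign 0,
+* exponent 01111, mantissa 0), so a constant one can be initialized as
+*   __half_raw one;
+*   one.x = 0x3C00;
+* which is exactly how the increment and decrement operators further
+* below construct the value 1.0.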
+* (Note these are visible to non-nvcc compilers, including C-only compilation)
+*/
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+typedef struct __CUDA_ALIGN__(4) {
+    unsigned short x;
+    unsigned short y;
+} __half2_raw;
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half() = default;
+#else
+    __CUDA_HOSTDEVICE__ __half() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+    /* Convert to/from __half_raw */
+    __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
+    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
+
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+
+    /* Construct from float/double */
+    __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
+    __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
+
+    __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; }
+
+/* Member functions only available to nvcc compilation so far */
+#if defined(__CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+    __device__ __half(short val) { __x = __short2half_rn(val).__x; }
+    __device__ __half(unsigned short val) { __x = __ushort2half_rn(val).__x; }
+    __device__ __half(int val) { __x = __int2half_rn(val).__x; }
+    __device__ __half(unsigned int val) { __x = __uint2half_rn(val).__x; }
+    __device__ __half(long long val) { __x = __ll2half_rn(val).__x; }
+    __device__ __half(unsigned long long val) { __x = __ull2half_rn(val).__x; }
+
+    /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
+    __device__ operator short() const { return __half2short_rz(*this); }
+    __device__ __half &operator=(short val) { __x = __short2half_rn(val).__x; return *this; }
+
+    __device__ operator unsigned short() const { return
__half2ushort_rz(*this); } + __device__ __half &operator=(unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } + + __device__ operator int() const { return __half2int_rz(*this); } + __device__ __half &operator=(int val) { __x = __int2half_rn(val).__x; return *this; } + + __device__ operator unsigned int() const { return __half2uint_rz(*this); } + __device__ __half &operator=(unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } + + __device__ operator long long() const { return __half2ll_rz(*this); } + __device__ __half &operator=(long long val) { __x = __ll2half_rn(val).__x; return *this; } + + __device__ operator unsigned long long() const { return __half2ull_rz(*this); } + __device__ __half &operator=(unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __device__ operator bool() const { return (__x & 0x7FFF) != 0; } +#endif /* defined(__CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16 operations only supported on arch >= 5.3 */ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3C00 equating to half(1.0f), to avoid the extra conversion */ +__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00; h += one; return h; } +__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00; h -= one; return h; } +__device__ __forceinline__ __half operator++(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h += one; return ret; } +__device__ __forceinline__ __half operator--(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h -= one; return ret; } + +/* Unary plus and inverse operators */ +__device__ __forceinline__ __half operator+(const __half &h) { return h; } +__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } +__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) 
{ return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* __half2 is visible to non-nvcc host compilers */ +struct __CUDA_ALIGN__(4) __half2 { + __half x; + __half y; + + // All construct/copy/assign/move +public: +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half2() = default; + __CUDA_HOSTDEVICE__ __half2(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); } + __CUDA_HOSTDEVICE__ __half2 &operator=(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; } +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; } + + /* Convert to/from __half2_raw */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; } + __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; } +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */ +#if (__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_HALF2_OPERATORS__) + +__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } +__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } +__device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } +__device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } + +__device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; } + +__device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return h; } +__device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return h; } +__device__ __forceinline__ __half2 operator++(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return ret; } +__device__ __forceinline__ __half2 operator--(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return ret; } + +__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; } +__device__ 
__forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
+
+__device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
+__device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
+__device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
+
+#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */
+#endif /* defined(__CUDACC__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
+{
+    unsigned int x;
+    unsigned int u;
+    unsigned int result = 0U;
+#if defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+    u = (x & 0x7fffffffU);
+    sign = ((x >> 16U) & 0x8000U);
+    // NaN/+Inf/-Inf
+    if (u >= 0x7f800000U) {
+        remainder = 0U;
+        result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
+    } else if (u > 0x477fefffU) { // Overflows
+        remainder = 0x80000000U;
+        result = (sign | 0x7bffU);
+    } else if (u >= 0x38800000U) { // Normal numbers
+        remainder = u << 19U;
+        u -= 0x38000000U;
+        result = (sign | (u >> 13U));
+    } else if (u < 0x33000001U) { // +0/-0
+        remainder = u;
+        result = sign;
+    } else { // Denormal numbers
+        const unsigned int exponent = u >> 23U;
+        const unsigned int shift = 0x7eU - exponent;
+        unsigned int mantissa = (u & 0x7fffffU);
+        mantissa |= 0x800000U;
+        remainder = mantissa << (32U - shift);
+        result = (sign | (mantissa >> shift));
+    }
+    return static_cast<unsigned short>(result);
+}
+#endif /* #if !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double x)
+{
+#if defined(__CUDA_ARCH__)
+    __half val;
+    asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(x));
+    return val;
+#else
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
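+    // Editorial note (not in the original header): a direct
+    // double -> float -> half conversion rounds twice, and the first
+    // rounding can land exactly on a half-precision tie, which the
+    // second rounding then resolves differently than a single correct
+    // rounding of the original double would. Pre-rounding to half's 11
+    // significant bits makes the subsequent conversions exact.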
+    unsigned long long int absx;
+    unsigned long long int ux;
+#if defined(__CUDACC__)
+    (void)memcpy(&ux, &x, sizeof(x));
+#else
+    (void)std::memcpy(&ux, &x, sizeof(x));
+#endif
+    absx = (ux & 0x7fffffffffffffffull);
+    if ((absx >= 0x40f0000000000000ull) || (absx <= 0x3e60000000000000ull))
+    {
+        // |x| >= 2^16 or NaN or |x| <= 2^(-25)
+        // double-rounding is not a problem
+        return __float2half(static_cast<float>(x));
+    }
+
+    // here 2^(-25) < |x| < 2^16
+    // prepare shifter value such that x + shifter
+    // done in double precision performs round-to-nearest-even
+    // and (x + shifter) - shifter results in x rounded to
+    // 11 bits of precision. Shifter needs to have exponent of
+    // x plus 53 - 11 = 42 and a leading bit in mantissa to guard
+    // against negative values.
+    // So need to have |x| capped to avoid overflow in exponent.
+    // For inputs that are smaller than half precision minnorm
+    // we prepare fixed shifter exponent.
+    unsigned long long shifterBits = ux & 0x7ff0000000000000ull;
+    if (absx >= 0x3f10000000000000ull)
+    {   // |x| >= 2^(-14)
+        // add 42 to exponent bits
+        shifterBits += 42ull << 52;
+    }
+    else
+    {   // 2^(-25) < |x| < 2^(-14), potentially results in denormal
+        // set exponent bits to 42 - 14 + bias
+        shifterBits = ((42ull - 14 + 1023) << 52);
+    }
+    // set leading mantissa bit to protect against negative inputs
+    shifterBits |= 1ull << 51;
+    double shifter;
+#if defined(__CUDACC__)
+    (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+#else
+    (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+#endif
+    double xShiftRound = x + shifter;
+
+    // Prevent the compiler from optimizing away x + shifter - shifter
+    // by doing an intermediate memcpy and a harmless bitwise operation
+    unsigned long long int xShiftRoundBits;
+#if defined(__CUDACC__)
+    (void)memcpy(&xShiftRoundBits, &xShiftRound, sizeof(xShiftRound));
+#else
+    (void)std::memcpy(&xShiftRoundBits, &xShiftRound, sizeof(xShiftRound));
+#endif
+
+    // the value is positive, so this operation doesn't change anything
+    xShiftRoundBits &= 0x7fffffffffffffffull;
+
+#if defined(__CUDACC__)
+    (void)memcpy(&xShiftRound, &xShiftRoundBits, sizeof(xShiftRound));
+#else
+    (void)std::memcpy(&xShiftRound, &xShiftRoundBits, sizeof(xShiftRound));
+#endif
+
+    double xRounded = xShiftRound - shifter;
+    float xRndFlt = static_cast<float>(xRounded);
+    __half res = __float2half(xRndFlt);
+    return res;
+#endif
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign;
+    unsigned int remainder;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low;\n"
+        " cvt.rn.f16.f32 low, %1;\n"
+        " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(a));
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " cvt.rn.f16.f32 low, %1;\n"
+        " cvt.rn.f16.f32 high, %2;\n"
+        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(b));
+#endif
+    return val;
+}
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static float __internal_half2float(const unsigned short h)
+{
+    unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
+    unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
+    unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
+    float f;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        sign = ((mantissa != 0U) ? 0U : sign);
+        mantissa = ((mantissa != 0U) ?
0x7fffffU : 0U); + exponent = 0xffU; + } else if (exponent == 0U) { /* Denorm or Zero */ + if (mantissa != 0U) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1U; /* normalize */ + --exponent; + } while (msb == 0U); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa); +#if defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(u)); +#else + (void)std::memcpy(&f, &u, sizeof(u)); +#endif + return f; +} +#endif /* !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a) +{ + float val; +#if defined(__CUDA_ARCH__) + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a))); +#else + val = __internal_half2float(static_cast<__half_raw>(a).x); +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a) +{ + float val; +#if defined(__CUDA_ARCH__) + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +#else + val = __internal_half2float(static_cast<__half2_raw>(a).x); +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a) +{ + float val; +#if defined(__CUDA_ARCH__) + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +#else + val = __internal_half2float(static_cast<__half2_raw>(a).y); +#endif + return val; +} + +/* Intrinsic functions only available to nvcc compilers */ +#if defined(__CUDACC__) + +/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */ +__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(__half x, __half y) +{ + __half2 t; t.x = x; t.y = y; return t; +} +#undef __VECTOR_FUNCTIONS_DECL__ + + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 f) +{ + __half2 val = __floats2half2_rn(f.x, f.y); + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 l) +{ + float hi_float; + float lo_float; +#if defined(__CUDA_ARCH__) + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(l))); + + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(l))); +#else + lo_float = __internal_half2float(((__half2_raw)l).x); + hi_float = __internal_half2float(((__half2_raw)l).y); +#endif + return make_float2(lo_float, hi_float); +} +__CUDA_FP16_DECL__ int __half2int_rn(__half h) +{ + int i; + asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rz(__half h) +{ + int i; + asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rd(__half h) +{ + int i; + asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_ru(__half h) +{ + int i; + asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __int2half_rn(int i) +{ + __half h; + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rz(int i) +{ + __half h; + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rd(int i) +{ + __half h; + 
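+    /* Editorial note (not in the original header): the PTX cvt rounding
+       suffixes used throughout these conversions are rni/rn = round to
+       nearest even, rzi/rz = toward zero, rmi/rm = toward minus infinity
+       (the *_rd intrinsics), and rpi/rp = toward plus infinity (the *_ru
+       intrinsics). */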
asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_ru(int i) +{ + __half h; + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ short int __half2short_rn(__half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rz(__half h) +{ + short int i; + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(__half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(__half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __short2half_rn(short int i) +{ + __half h; + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rz(short int i) +{ + __half h; + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rd(short int i) +{ + __half h; + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_ru(short int i) +{ + __half h; + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h) +{ + unsigned int i; + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i) +{ + __half h; + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i) +{ + __half h; + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i) +{ + __half h; + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i) +{ + __half h; + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h) +{ + unsigned short int i; + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i) +{ + __half h; + asm("cvt.rn.f16.u16 %0, %1;" : 
"=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i) +{ + __half h; + asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i) +{ + __half h; + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i) +{ + __half h; + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h) +{ + unsigned long long int i; + asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h) +{ + unsigned long long int i; + asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h) +{ + unsigned long long int i; + asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i) +{ + __half h; + asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i) +{ + __half h; + asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i) +{ + __half h; + asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i) +{ + __half h; + asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h) +{ + long long int i; + asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h) +{ + long long int i; + asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h) +{ + long long int i; + asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h) +{ + long long int i; + asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i) +{ + __half h; + asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i) +{ + __half h; + asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i) +{ + __half h; + asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i) +{ + __half h; + asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ __half htrunc(const __half h) +{ + __half r; + asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hceil(const __half h) +{ + __half r; + asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hfloor(const __half h) +{ + __half 
r; + asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hrint(const __half h) +{ + __half r; + asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} + +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rzi.f16.f16 low, low;\n" + " cvt.rzi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rpi.f16.f16 low, low;\n" + " cvt.rpi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rmi.f16.f16 low, low;\n" + " cvt.rmi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 l, const __half2 h) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 l, const __half2 h) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half __low2half(const __half2 h) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h))); + return ret; +} +__CUDA_FP16_DECL__ int __hisinf(const __half a) +{ + if (__HALF_TO_CUS(a) == 0xFC00) { + return -1; + } + if (__HALF_TO_CUS(a) == 0x7C00) { + return 1; + } + return 0; +} +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 l) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 l) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ __half __high2half(const __half2 h) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h))); + return ret; +} +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half l, const __half h) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(l)), "h"(__HALF_TO_CUS(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __half2half2(const __half lh) +{ + __half2 val; + 
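+    /* editorial note: the mov.b32 %0, {%1,%1} below broadcasts lh into
+       both halves of the returned __half2 */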
asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(lh))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 lh) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(lh))); + return val; +} +__CUDA_FP16_DECL__ short int __half_as_short(const __half h) +{ + return (short int)__HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ + return __HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ __half __short_as_half(const short int i) +{ + __half h; + __HALF_TO_US(h) = (unsigned short int)i; + return h; +} +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ + __half h; + __HALF_TO_US(h) = i; + return h; +} + +#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{"#name" %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} /* while(0) */ + +#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{"#name" %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half2 __shfl(__half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up(__half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = (warpSize - width) << 8; + __SHUFFLE_HALF2_MACRO(shfl.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down(__half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor(__half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32) +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = (warpSize - width) << 8; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - 
width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32) +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half __shfl(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2half(temp2); +} + +#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)*/ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *ptr) +{ + __half ret; + asm 
("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *ptr, __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *ptr, __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half2 *ptr, __half2 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half *ptr, __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *ptr, __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *ptr, __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *ptr, __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *ptr, __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ + 
__COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __COMPARISON_OP_HALF2_MACRO +#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + if (__HALF2_TO_CUI(val) == 0x3C003C00) \ + return true; \ + else \ + return false; \ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __BOOL_COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp."#name".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return val ? 
true : false; \ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq) +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne) +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le) +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge) +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt) +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt) +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ) +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu) +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu) +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu) +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu) +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add) +} +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub) +} +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul) +} +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half2 __h2div(__half2 a, __half2 b) { + __half ha, hb; + + ha = __low2half(a); + hb = __low2half(b); + + __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add) +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub) +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul) +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat) +} + +__CUDA_FP16_DECL__ __half 
__hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) { + __half v, abs, den; + __HALF_TO_US(den) = 0x008F; + float fa, fb, fv, rcp; + + fa = __half2float(a); + fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) { + float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc,"#spc";\n"\ + " mov.b32 ulp,"#ulp";\n"\ + " set.eq.f16x2.f16x2 p,"#i", spc;\n"\ + " fma.rn.f16x2 "#r",p,ulp,"#r";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc,"#spc";\n"\ + " mov.b16 ulp,"#ulp";\n"\ + " set.eq.f16.f16 p,"#i", spc;\n"\ + " fma.rn.f16 "#r",p,ulp,"#r";\n}\n" +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " "#fun".approx.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " "#fun".approx.f32 fl, fl; \n"\ + " "#fun".approx.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +static __device__ __forceinline__ float __float_simpl_sinf(float); +static __device__ __forceinline__ float __float_simpl_cosf(float); +__CUDA_FP16_DECL__ __half __hsin_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_sinf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hsin(const __half a) { + __half r = __hsin_internal(a); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " mov.b16 t, 0x8000; \n\t" + " and.b16 t,r,t; \n\t" + __SPEC_CASE(i, r, 0X32B3, 0x0800) + __SPEC_CASE(i, r, 0X5CB0, 0x1000) + __SPEC_CASE(i, r, 0XB2B3, 0x8800) + __SPEC_CASE(i, r, 0XDCB0, 0x9000) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = __halves2half2(__hsin_internal(l), __hsin_internal(h)); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3, 0x08000800) + __SPEC_CASE2(i, r, 0X5CB05CB0, 0x10001000) + __SPEC_CASE2(i, r, 0XB2B3B2B3, 0x88008800) + __SPEC_CASE2(i, r, 0XDCB0DCB0, 0x90009000) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} 
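+/*
+ * Editorial sketch, not part of the original NVIDIA header: hsin/h2sin above
+ * compute sine through float and then patch a few worst-case inputs with the
+ * one-ulp __SPEC_CASE fix-ups. A hypothetical kernel using the packed variant
+ * for throughput could look like this; `in`, `out`, and `n2` (the number of
+ * __half2 elements) are illustrative names only:
+ *
+ *   __global__ void sin_f16x2(const __half2 *in, __half2 *out, int n2) {
+ *       int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *       if (i < n2) {
+ *           out[i] = h2sin(in[i]);  // two half-precision sines per thread
+ *       }
+ *   }
+ */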
+__CUDA_FP16_DECL__ __half __hcos_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_cosf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + __half r = __hcos_internal(a); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + __SPEC_CASE(i, r, 0X2B7C, 0x1000) + __SPEC_CASE(i, r, 0XAB7C, 0x1000) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = __halves2half2(__hcos_internal(l), __hcos_internal(h)); + asm("{\n\t" + " .reg.b32 i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7C, 0x10001000) + __SPEC_CASE2(i, r, 0XAB7CAB7C, 0x10001000) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(float a, int *quadrant) +{ + float j, t; + int q; + q = __float2int_rn(a * 0.636619772F); + j = (float)q; + t = __fmaf_rn(-j, 1.5707962512969971e+000F, a); + t = __fmaf_rn(-j, 7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, int i) +{ + float x2, z; + x2 = x*x; + + if (i & 1) { + z = 2.44331571e-5F; + z = __fmaf_rn(z, x2, -1.38873163e-3F); + } + else { + z = -1.95152959e-4F; + z = __fmaf_rn(z, x2, 8.33216087e-3F); + } + if (i & 1) { + z = __fmaf_rn(z, x2, 4.16666457e-2F); + z = __fmaf_rn(z, x2, -5.00000000e-1F); + } + else { + z = __fmaf_rn(z, x2, -1.66666546e-1F); + z = __fmaf_rn(z, x2, 0.0F); + } + x = __fmaf_rn(z, x, x); + if (i & 1) { + x = __fmaf_rn(z, x2, 1.0F); + } + if (i & 2) { + x = __fmaf_rn(x, -1.0F, 0.0F); + } + return x; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + int i; + if (::isinf(a)) { + a = a * 0.0F; + } + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + int i; + if (::isinf(a)) { + a = a * 0.0F; + } + a = __internal_trig_reduction_kernel(a, &i); + i++; + z = __internal_sin_cos_kernel(a, i); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79, 0x9400) + __SPEC_CASE(h, r, 0X25CF, 0x9400) + __SPEC_CASE(h, r, 0XC13B, 0x0400) + __SPEC_CASE(h, r, 0XC1EF, 0x0200) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79, 0x94009400) + __SPEC_CASE2(h, r, 0X25CF25CF, 0x94009400) + __SPEC_CASE2(h, r, 0XC13BC13B, 0x04000400) + __SPEC_CASE2(h, r, 0XC1EFC1EF, 0x02000200) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : 
"r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.f32 f,f; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DE, 0x9800) + __SPEC_CASE(h, r, 0x9766, 0x9000) + __SPEC_CASE(h, r, 0x9972, 0x1000) + __SPEC_CASE(h, r, 0xA5C4, 0x1000) + __SPEC_CASE(h, r, 0xBF0A, 0x8100) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DE, 0x98009800) + __SPEC_CASE2(h, r, 0x97669766, 0x90009000) + __SPEC_CASE2(h, r, 0x99729972, 0x10001000) + __SPEC_CASE2(h, r, 0xA5C4A5C4, 0x10001000) + __SPEC_CASE2(h, r, 0xBF0ABF0A, 0x81008100) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2, 0x8080) + __SPEC_CASE(r, r, 0xBF46, 0x9400) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2, 0x80808080) + __SPEC_CASE2(r, r, 0xBF46BF46, 0x94009400) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160D, 0x9C00) 
+ __SPEC_CASE(h, r, 0X3BFE, 0x8010) + __SPEC_CASE(h, r, 0X3C0B, 0x8080) + __SPEC_CASE(h, r, 0X6051, 0x1C00) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160D, 0x9C009C00) + __SPEC_CASE2(h, r, 0X3BFE3BFE, 0x80108010) + __SPEC_CASE2(h, r, 0X3C0B3C0B, 0x80808080) + __SPEC_CASE2(h, r, 0X60516051, 0x1C001C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " mov.b32 C, 0x3E9A209B; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338F, 0x1000) + __SPEC_CASE(h, r, 0x33F8, 0x9000) + __SPEC_CASE(h, r, 0x57E1, 0x9800) + __SPEC_CASE(h, r, 0x719D, 0x9C00) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209B; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338F, 0x10001000) + __SPEC_CASE2(h, r, 0x33F833F8, 0x90009000) + __SPEC_CASE2(h, r, 0x57E157E1, 0x98009800) + __SPEC_CASE2(h, r, 0x719D719D, 0x9C009C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ bool __hisnan(const __half a) +{ + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +} +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hneg(const __half a) +{ + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ 
__half2 __habs2(const __half2 a) +{ + __half2 r; + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __habs(const __half a) +{ + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +#endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ + +#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(max) +} +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(min) +} +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +} +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(max) +} +__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(min) +} +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +} +#endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/ + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *address, __half2 val) { + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +__CUDA_FP16_DECL__ __half atomicAdd(__half *address, __half val) { + __half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/ + +#undef __PTR + +#undef __CUDA_FP16_DECL__ +#endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) 
&& !defined(CUDA_NO_HALF) +typedef __half half; +typedef __half2 half2; +// for consistency with __nv_bfloat16 +typedef __half __nv_half; +typedef __half2 __nv_half2; +typedef __half_raw __nv_half_raw; +typedef __half2_raw __nv_half2_raw; +typedef __half nv_half; +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.h b/cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.h new file mode 100755 index 0000000..57441c6 --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.h @@ -0,0 +1,3631 @@ +/* +* Copyright 1993-2020 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions that are +* only supported in device code. 
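+* As a minimal illustration (an editorial sketch, not part of the original
+* header; the kernel and variable names are hypothetical):
+* \code
+* __global__ void scale(__half *x, const __half s, const int n) {
+*     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n) { x[i] = __hmul(x[i], s); }  // half-precision multiply
+* }
+* \endcode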
+* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion And Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions include the header file \p cuda_fp16.h in your program. +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +#if defined(__cplusplus) +#if defined(__CUDACC__) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__CUDACC__) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ + +/** + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * + * \internal + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. + * \endinternal + */ +struct __half; + +/** + * \brief half2 datatype + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * + * \internal + * \req Vectorified version of half. + * \endinternal + */ +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns half +* \retval \p a converted to half. 
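+*
+* Editorial sketch, not part of the original header: 0.1 has no exact fp16
+* representation, so the nearest-even result is approximately 0.0999756.
+* \code
+* __half h = __double2half(0.1);
+* \endcode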
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* \retval \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* \retval \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns half +* \retval \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* \retval \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* \retval \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* \details Converts half number \p a to float. +* \param[in] a - half. Is only being read. +* +* \returns float +* \retval \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* \details Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* \param[in] a - float.
Is only being read. +* +* \returns half2 +* \retval The \p half2 value with both halves equal to the converted half +* precision number. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* \retval The \p half2 value with corresponding halves equal to the +* converted input floats. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* \retval The low 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* \retval The high 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); + +#if defined(__CUDACC__) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of float2 to half precision in round-to-nearest +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* \retval The \p half2 which has corresponding halves equal to the +* converted float2 components. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* \retval \p a converted to float2. 
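+*
+* Editorial sketch, not part of the original header: a lossless round trip
+* for components exactly representable in half precision.
+* \code
+* float2 f = make_float2(1.0f, 2.0f);
+* __half2 h = __float22half2_rn(f);  // pack both components as fp16
+* float2 g = __half22float2(h);      // g.x == 1.0f, g.y == 2.0f
+* \endcode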
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed integer in +* round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns int +* \retval \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rz(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. 
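+*
+* Editorial sketch, not part of the original header (device code): 2049 is
+* not representable in fp16, where the spacing near 2048 is 2, so the
+* rounding mode is observable.
+* \code
+* __half lo = __int2half_rd(2049);  // yields 2048
+* __half hi = __int2half_ru(2049);  // yields 2050
+* \endcode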
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rd(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* \details Convert the signed integer value \p i to a half-precision floating point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_ru(int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed short +* integer in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-towards-zero mode. 
+* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rz(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rd(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating +* point value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_ru(short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned integer +* in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* \retval \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. 
+* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-up mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned short +* integer in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned short +* integer in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned short +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval \p h converted to an unsigned short integer. 
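+*
+* Editorial sketch, not part of the original header (device code):
+* \code
+* unsigned short s = __half2ushort_rd(__float2half(3.75f));  // s == 3
+* \endcode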
+*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned short +* integer in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval \p h converted to an unsigned short integer. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. +* \param[in] h - half. Is only being read.
+* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* \retval \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. 
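+*
+* Editorial sketch, not part of the original header (device code): with
+* round-up, a value falling between two representable fp16 numbers moves
+* toward the larger one.
+* \code
+* __half h = __ull2half_ru(2049ULL);  // yields 2050; __ull2half_rd gives 2048
+* \endcode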
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-towards-zero mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-down mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-up mode. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* \retval \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +*/ +__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-down mode. 
+* \param[in] i - long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* \retval \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The truncated integer value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htrunc(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The smallest integer value not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The largest integer value which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating point +* number. +* +* \details Round \p h to the nearest integer value in half-precision floating point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - half. Is only being read. +* +* \returns half +* \retval The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* \retval The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. 
+*
+* \returns half2
+* \retval The vector of smallest integers not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector of largest integers which are less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating point
+* number.
+*
+* \details Round each component of \p half2 vector \p h to the nearest integer value in
+* half-precision floating point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector of rounded integer values.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns \p half2 with both halves equal to the input value.
+*
+* \details Returns \p half2 number with both halves equal to the input \p a \p half
+* number.
+* \param[in] a - half. Is only being read.
+*
+* \returns half2
+* \retval The vector which has both its halves equal to the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __half2half2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Swaps both halves of the \p half2 input.
+*
+* \details Swaps both halves of the \p half2 input and returns a new \p half2 number
+* with swapped halves.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval \p a with its halves being swapped.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number.
+*
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of
+* the return value, low 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The low 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+*
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number.
High 16 bits from input \p a are stored in low 16 bits of
+* the return value, high 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The high 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* \retval The high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* \retval Returns \p half which contains low 16 bits of the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+*
+* \details Checks if the input \p half number \p a is infinite.
+* \param[in] a - half. Is only being read.
+*
+* \returns int
+* \retval -1 iff \p a is equal to negative infinity,
+* \retval 1 iff \p a is equal to positive infinity,
+* \retval 0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+*
+* \details Combines two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half2
+* \retval The half2 with one half equal to \p a and the other to \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from \p half2 input.
+*
+* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with both halves equal to the low 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from \p half2 input.
+*
+* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with both halves equal to the high 16 bits of the input.
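+* An illustrative sketch (editorial; assumes device code and a hypothetical
+* helper name) of packing, inspecting, and swapping the two halves of a \p half2:
+* \code
+* __device__ void example_lanes(void)
+* {
+*     const __half2 v  = __halves2half2(__float2half(1.0f), __float2half(2.0f));
+*     const __half  lo = __low2half(v);        // 1.0
+*     const __half  hi = __high2half(v);       // 2.0
+*     const __half2 sw = __lowhigh2highlow(v); // (2.0, 1.0)
+* }
+* \endcode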
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* \details Reinterprets the bits in the half-precision floating point number \p h +* as a signed short integer. +* \param[in] h - half. Is only being read. +* +* \returns short int +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. +* +* \details Reinterprets the bits in the half-precision floating point \p h +* as an unsigned short number. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* half-precision floating point number. +* \param[in] i - short int. Is only being read. +* +* \returns half +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* \retval The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); + +#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." 
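+
+/* Editorial worked example: with x = __shfl, the message macro above expands,
+ * via stringification and string-literal concatenation, to:
+ * "__shfl() is deprecated in favor of __shfl_sync() and may be removed in a
+ * future release (Use -Wno-deprecated-declarations to suppress this warning)."
+ */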
+
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(__half2 var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(__half2 var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half2 __shfl_down(__half2 var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(__half2 var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(__half var, int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(__half var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(__half var, unsigned int delta, int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(__half var, int delta, int width = warpSize);
+#endif
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the delta modulo width (i.e.
+* within the same subsection). width must have a value which is a power of 2;
+* results are undefined if width is not a power of 2, or is a number greater than
+* warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2,
+* or is a number greater than warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding delta to the caller's thread ID.
+* The value of var held by the resulting thread ID is returned: this has the effect
+* of shifting var down the warp by delta threads. If width is less than warpSize then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of width and so the upper delta threads
+* will remain unchanged.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask:
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads are able to access elements from earlier groups of threads,
+* however if they attempt to access elements from later groups of threads their own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the delta modulo width (i.e.
+* within the same subsection).
width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
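+* An illustrative sketch (editorial; assumes a fully converged warp, hence the
+* 0xffffffff mask, a hypothetical helper name, and __CUDA_ARCH__ >= 530 for
+* __hadd): the classic shuffle-down tree reduction.
+* \code
+* __device__ __half warp_sum(__half v)
+* {
+*     for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+*         v = __hadd(v, __shfl_down_sync(0xffffffffU, v, offset));
+*     }
+*     return v; // lane 0 now holds the sum over the warp
+* }
+* \endcode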
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif +#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) */ + +#if defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) ) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldg(const __half *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldca(const __half *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. 
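+* An illustrative sketch (editorial; hypothetical kernel name, assumes global
+* memory pointers): a streaming copy that pairs this evict-first load with the
+* matching __stcs store declared further below.
+* \code
+* __global__ void stream_copy(const __half2 *in, __half2 *out, int n)
+* {
+*     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n) {
+*         __stcs(&out[i], __ldcs(&in[i]));
+*     }
+* }
+* \endcode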
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldlu(const __half *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcv(const __half *ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half2 *ptr, __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half *ptr, __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half2 *ptr, __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half *ptr, __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half2 *ptr, __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half *ptr, __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half2 *ptr, __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half *ptr, __half value); +#endif /*defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )*/ + +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* \retval The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The \p half2 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The half2 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The vector result of greater-than comparison of vectors \p a and \p b. 
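+* An illustrative sketch (editorial; hypothetical helper name, assumes
+* __CUDA_ARCH__ >= 530): the per-lane 1.0/0.0 masks compose with the
+* arithmetic declared later in this section into branchless selection.
+* \code
+* __device__ __half2 max2(const __half2 a, const __half2 b)
+* {
+*     const __half2 m = __hgt2(a, b);      // 1.0 where a > b, else 0.0
+*     return __hfma2(m, __hsub2(a, b), b); // m*(a-b) + b selects a or b per lane
+* }
+* \endcode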
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The vector result of unordered less-than comparison of vectors \p a and \p b.
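+* An illustrative sketch (editorial; hypothetical helper name): unordered
+* comparisons report true on NaN lanes, which suits NaN-tolerant masking.
+* \code
+* __device__ __half2 below_or_nan(const __half2 a, const __half2 b)
+* {
+*     return __hltu2(a, b); // 1.0 where a < b or either lane is NaN
+* }
+* \endcode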
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Determine whether \p half2 argument is a NaN.
+*
+* \details Determine whether each half of input \p half2 number \p a is a NaN.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* \retval The half2 with the corresponding \p half results set to
+* 1.0 for NaN, 0.0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* \retval The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector division in round-to-nearest-even mode.
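+*
+* An illustrative sketch (editorial; hypothetical helper name, assumes
+* __CUDA_ARCH__ >= 530): elementwise normalization of two values at once.
+* \code
+* __device__ __half2 normalize2(const __half2 v, const __half2 norm)
+* {
+*     return __h2div(v, norm); // lanewise v / norm
+* }
+* \endcode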
+* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval Returns \p a with the absolute value of both halves. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. 
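+*
+* An illustrative note (editorial; hypothetical helper name): the fused form
+* rounds once, so __hfma2(a, x, y) can differ in the last place from
+* __hadd2(__hmul2(a, x), y), which rounds twice.
+* \code
+* __device__ __half2 axpy2(const __half2 a, const __half2 x, const __half2 y)
+* {
+*     return __hfma2(a, x, y); // a*x + y with a single rounding per lane
+* }
+* \endcode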
+* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval Returns \p a with both halves negated. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The absolute value of a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. 
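+* An illustrative sketch (editorial; hypothetical helper name): a compact
+* linear interpolation built from __hsub and the __hfma declared below.
+* \code
+* __device__ __half lerp(const __half a, const __half b, const __half t)
+* {
+*     return __hfma(t, __hsub(b, a), a); // a + t*(b - a)
+* }
+* \endcode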
+* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \retval The result of multiplying \p a and \p b, with respect to saturation. 
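+* An illustrative sketch (editorial; hypothetical helper name): the _sat
+* variants clamp to [0.0, 1.0], which suits accumulating blend weights.
+* \code
+* __device__ __half accumulate_weight(const __half w0, const __half w1)
+* {
+*     return __hadd_sat(w0, w1); // clamped to [0.0, 1.0]; NaN flushed to +0.0
+* }
+* \endcode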
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* \retval The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* \retval The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* \details Negates input \p half number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison, and returns boolean true +* iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of if-equal comparison +* of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, +* \retval false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* \retval true, if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* \retval false, otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of not-equal comparison of \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered not-equal comparison of \p a and +* \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* \retval The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* \details Determine whether \p half value \p a is a NaN. +* \param[in] a - half. Is only being read. +* +* \returns bool +* \retval true iff argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hisnan(const __half a); +#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
+* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values, NaNs pass through. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values, NaNs pass through. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* \retval The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs. +* +* \details Calculates \p half2 vector max(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns half2 +* \retval The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs. +* +* \details Calculates \p half2 vector min(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector max(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector min(\p a, \p b) +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); +#endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/ +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as +* complex numbers in \p half precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* \retval The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input \p a in round-to-nearest +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The reciprocal of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. 
+* +* \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half binary exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* \retval The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest +* mode. 
+* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise reciprocal on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise decimal logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise binary exponential function on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise decimal exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* \retval The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); + +#endif /*if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ + +#if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__) + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *address, __half2 val); + +#endif /*if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)*/ + +#if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__) + +__CUDA_FP16_DECL__ __half atomicAdd(__half *address, __half val); + +#endif /*if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)*/ + +#endif /* defined(__CUDACC__) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ + +#endif /* defined(__cplusplus) */ + +/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */ +#include "cuda_fp16.hpp" + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.hpp b/cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.hpp new file mode 100755 index 0000000..123c6e0 --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-11.1/cuda_fp16.hpp @@ -0,0 +1,2453 @@ +/* +* Copyright 1993-2020 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. 
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users. These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_FP16_HPP__)
+#define __CUDA_FP16_HPP__
+
+#if !defined(__CUDA_FP16_H__)
+#error "Do not include this file directly. Instead, include cuda_fp16.h."
+#endif
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+# define __CPP_VERSION_AT_LEAST_11_FP16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+# define __CPP_VERSION_AT_LEAST_11_FP16
+#endif
+
+/* C++11 header for std::move.
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
+#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Macros for half & half2 binary arithmetic */
+#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
+ __half val; \
+ asm( "{"#name".f16 %0,%1,%2;\n}" \
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
+ return val; \
+} /* while(0) */
+#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
+ __half2 val; \
+ asm( "{"#name".f16x2 %0,%1,%2;\n}" \
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+ return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
+ __half val; \
+ asm( "{"#name".f16 %0,%1,%2,%3;\n}" \
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
+ return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
+ __half2 val; \
+ asm( "{"#name".f16x2 %0,%1,%2,%3;\n}" \
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
+ return val; \
+} /* while(0) */
+
+/**
+* Types which allow static initialization of "half" and "half2" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "half", and not a conversion from short->half.
+* Such a representation will be deprecated in a future version of CUDA.
+* (Note these are visible to non-nvcc compilers, including C-only compilation)
+*/
+typedef struct __CUDA_ALIGN__(2) {
+ unsigned short x;
+} __half_raw;
+
+typedef struct __CUDA_ALIGN__(4) {
+ unsigned short x;
+ unsigned short y;
+} __half2_raw;
+
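+/* [Editor's note -- usage sketch, not part of the original NVIDIA header]
+* The raw types above exist so "half" values can be statically initialized
+* from their bit patterns. 0x3C00 is the binary16 encoding of 1.0, the same
+* constant the increment/decrement operators further below use:
+*
+* __half_raw one_raw = { 0x3C00 }; // bits of half(1.0f), no short->half conversion
+* __half2_raw ones_raw = { 0x3C00, 0x3C00 }; // half2(1.0f, 1.0f)
+* __half one = one_raw; // C++ only: __half converts from __half_raw
+*/
+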
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+ The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+ unsigned short __x;
+
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+ __half() = default;
+#else
+ __CUDA_HOSTDEVICE__ __half() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+ /* Convert to/from __half_raw */
+ __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
+ __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
+ __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
+ __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
+
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+
+ /* Construct from float/double */
+ __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
+ __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
+
+ __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
+ __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
+
+ /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+ __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; }
+
+/* Member functions only available to nvcc compilation so far */
+#if defined(__CUDACC__)
+ /* Allow automatic construction from types supported natively in hardware */
+ /* Note we do avoid constructor init-list because of special host/device compilation rules */
+ __CUDA_HOSTDEVICE__ __half(short val) { __x = __short2half_rn(val).__x; }
+ __CUDA_HOSTDEVICE__ __half(unsigned short val) { __x = __ushort2half_rn(val).__x; }
+ __CUDA_HOSTDEVICE__ __half(int val) { __x = __int2half_rn(val).__x; }
+ __CUDA_HOSTDEVICE__ __half(unsigned int val) { __x = __uint2half_rn(val).__x; }
+ __CUDA_HOSTDEVICE__ __half(long long val) { __x = __ll2half_rn(val).__x; }
+ __CUDA_HOSTDEVICE__ __half(unsigned long long val) { __x = __ull2half_rn(val).__x; }
+
+ /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
+ __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); }
+ __CUDA_HOSTDEVICE__ __half &operator=(short val) { __x = __short2half_rn(val).__x; return *this; }
+ + __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(int val) { __x = __int2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(long long val) { __x = __ll2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFF) != 0; } +#endif /* defined(__CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16 operations only supported on arch >= 5.3 */ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3C00 equating to half(1.0f), to avoid the extra conversion */ +__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00; h += one; return h; } +__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00; h -= one; return h; } +__device__ __forceinline__ __half operator++(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h += one; return ret; } +__device__ __forceinline__ __half operator--(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h -= one; return ret; } + +/* Unary plus and inverse operators */ +__device__ __forceinline__ __half operator+(const __half &h) { return h; } +__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } +__device__ __forceinline__ 
bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* __half2 is visible to non-nvcc host compilers */ +struct __CUDA_ALIGN__(4) __half2 { + __half x; + __half y; + + // All construct/copy/assign/move +public: +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half2() = default; + __CUDA_HOSTDEVICE__ __half2(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); } + __CUDA_HOSTDEVICE__ __half2 &operator=(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; } +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; } + + /* Convert to/from __half2_raw */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; } + __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; } +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */ +#if (__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_HALF2_OPERATORS__) + +__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } +__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } +__device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } +__device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } + +__device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; } + +__device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return h; } +__device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return h; } +__device__ __forceinline__ __half2 operator++(__half2 &h, int) { __half2 ret = h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hadd2(h, one); return ret; } +__device__ __forceinline__ __half2 operator--(__half2 &h, int) { __half2 ret = 
h; __half2_raw one; one.x = 0x3C00; one.y = 0x3C00; h = __hsub2(h, one); return ret; }
+
+__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
+__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
+
+__device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
+__device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
+__device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
+
+#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */
+#endif /* defined(__CUDACC__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
+static unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
+{
+ unsigned int x;
+ unsigned int u;
+ unsigned int result = 0U;
+#if defined(__CUDACC__)
+ (void)memcpy(&x, &f, sizeof(f));
+#else
+ (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+ u = (x & 0x7fffffffU);
+ sign = ((x >> 16U) & 0x8000U);
+ // NaN/+Inf/-Inf
+ if (u >= 0x7f800000U) {
+ remainder = 0U;
+ result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
+ } else if (u > 0x477fefffU) { // Overflows
+ remainder = 0x80000000U;
+ result = (sign | 0x7bffU);
+ } else if (u >= 0x38800000U) { // Normal numbers
+ remainder = u << 19U;
+ u -= 0x38000000U;
+ result = (sign | (u >> 13U));
+ } else if (u < 0x33000001U) { // +0/-0
+ remainder = u;
+ result = sign;
+ } else { // Denormal numbers
+ const unsigned int exponent = u >> 23U;
+ const unsigned int shift = 0x7eU - exponent;
+ unsigned int mantissa = (u & 0x7fffffU);
+ mantissa |= 0x800000U;
+ remainder = mantissa << (32U - shift);
+ result = (sign | (mantissa >> shift));
+ }
+ return static_cast<unsigned short>(result);
+}
+#endif /* #if !defined(__CUDACC_RTC__) */
+
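+/* [Editor's note -- worked example, not part of the original NVIDIA header]
+* Tracing __internal_float2half above on 1.0f (binary32 bit pattern 0x3f800000):
+* u = 0x3f800000 takes the "Normal numbers" branch (u >= 0x38800000);
+* remainder = u << 19 = 0, since the 13 float mantissa bits to be discarded are zero;
+* u -= 0x38000000 re-biases the exponent (127 - 15 = 112 = 0x70), giving 0x07800000;
+* result = sign | (u >> 13) = 0x3c00, the binary16 encoding of 1.0.
+* The callers below then consume `remainder` to apply their rounding mode.
+*/
+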
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double x)
+{
+#if defined(__CUDA_ARCH__)
+ __half val;
+ asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(x));
+ return val;
+#else
+ // Perform rounding to 11 bits of precision, convert value
+ // to float and call existing float to half conversion.
+ // By pre-rounding to 11 bits we avoid additional rounding
+ // in float to half conversion.
+ unsigned long long int absx;
+ unsigned long long int ux;
+#if defined(__CUDACC__)
+ (void)memcpy(&ux, &x, sizeof(x));
+#else
+ (void)std::memcpy(&ux, &x, sizeof(x));
+#endif
+ absx = (ux & 0x7fffffffffffffffull);
+ if ((absx >= 0x40f0000000000000ull) || (absx <= 0x3e60000000000000ull))
+ {
+ // |x| >= 2^16 or NaN or |x| <= 2^(-25)
+ // double-rounding is not a problem
+ return __float2half(static_cast<float>(x));
+ }
+
+ // here 2^(-25) < |x| < 2^16
+ // prepare shifter value such that x + shifter
+ // done in double precision performs round-to-nearest-even
+ // and (x + shifter) - shifter results in x rounded to
+ // 11 bits of precision. Shifter needs to have exponent of
+ // x plus 53 - 11 = 42 and a leading bit in mantissa to guard
+ // against negative values.
+ // So need to have |x| capped to avoid overflow in exponent.
+ // For inputs that are smaller than half precision minnorm
+ // we prepare fixed shifter exponent.
+ unsigned long long shifterBits = ux & 0x7ff0000000000000ull;
+ if (absx >= 0x3f10000000000000ull)
+ { // |x| >= 2^(-14)
+ // add 42 to exponent bits
+ shifterBits += 42ull << 52;
+ }
+ else
+ { // 2^(-25) < |x| < 2^(-14), potentially results in denormal
+ // set exponent bits to 42 - 14 + bias
+ shifterBits = ((42ull - 14 + 1023) << 52);
+ }
+ // set leading mantissa bit to protect against negative inputs
+ shifterBits |= 1ull << 51;
+ double shifter;
+#if defined(__CUDACC__)
+ (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+#else
+ (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+#endif
+ double xShiftRound = x + shifter;
+
+ // Prevent the compiler from optimizing away x + shifter - shifter
+ // by doing intermediate memcopy and harmless bitwise operation
+ unsigned long long int xShiftRoundBits;
+#if defined(__CUDACC__)
+ (void)memcpy(&xShiftRoundBits, &xShiftRound, sizeof(xShiftRound));
+#else
+ (void)std::memcpy(&xShiftRoundBits, &xShiftRound, sizeof(xShiftRound));
+#endif
+
+ // the value is positive, so this operation doesn't change anything
+ xShiftRoundBits &= 0x7fffffffffffffffull;
+
+#if defined(__CUDACC__)
+ (void)memcpy(&xShiftRound, &xShiftRoundBits, sizeof(xShiftRound));
+#else
+ (void)std::memcpy(&xShiftRound, &xShiftRoundBits, sizeof(xShiftRound));
+#endif
+
+ double xRounded = xShiftRound - shifter;
+ float xRndFlt = static_cast<float>(xRounded);
+ __half res = __float2half(xRndFlt);
+ return res;
+#endif
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{
+ __half val;
+#if defined(__CUDA_ARCH__)
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+ __half_raw r;
+ unsigned int sign;
+ unsigned int remainder;
+ r.x = __internal_float2half(a, sign, remainder);
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+ r.x++;
+ }
+ val = r;
+#endif
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{
+ __half val;
+#if defined(__CUDA_ARCH__)
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+ __half_raw r;
+ unsigned int sign;
+ unsigned int remainder;
+ r.x = __internal_float2half(a, sign, remainder);
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+ r.x++;
+ }
+ val = r;
+#endif
+ return val;
+}
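+/* [Editor's note -- observation and sketch, not part of the original NVIDIA header]
+* __float2half and __float2half_rn above are the same round-to-nearest-even
+* conversion; the _rz/_rd/_ru variants below reuse __internal_float2half on the
+* host and differ only in how the discarded `remainder` bits are folded back in
+* (truncate, round toward -infinity, round toward +infinity). For example:
+*
+* __half h = __float2half(0.1f); // bits 0x2e66, approx. 0.0999755859375
+*/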
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{
+ __half val;
+#if defined(__CUDA_ARCH__)
+ asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+ __half_raw r;
+ unsigned int sign;
+ unsigned int remainder;
+ r.x = __internal_float2half(a, sign, remainder);
+ val = r;
+#endif
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{
+ __half val;
+#if defined(__CUDA_ARCH__)
+ asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+ __half_raw r;
+ unsigned int sign;
+ unsigned int remainder;
+ r.x = __internal_float2half(a, sign, remainder);
+ if ((remainder != 0U) && (sign != 0U)) {
+ r.x++;
+ }
+ val = r;
+#endif
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{
+ __half val;
+#if defined(__CUDA_ARCH__)
+ asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+ __half_raw r;
+ unsigned int sign;
+ unsigned int remainder;
+ r.x = __internal_float2half(a, sign, remainder);
+ if ((remainder != 0U) && (sign == 0U)) {
+ r.x++;
+ }
+ val = r;
+#endif
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{
+ __half2 val;
+#if defined(__CUDA_ARCH__)
+ asm("{.reg .f16 low;\n"
+ " cvt.rn.f16.f32 low, %1;\n"
+ " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+#else
+ val = __half2(__float2half_rn(a), __float2half_rn(a));
+#endif
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{
+ __half2 val;
+#if defined(__CUDA_ARCH__)
+ asm("{.reg .f16 low,high;\n"
+ " cvt.rn.f16.f32 low, %1;\n"
+ " cvt.rn.f16.f32 high, %2;\n"
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+#else
+ val = __half2(__float2half_rn(a), __float2half_rn(b));
+#endif
+ return val;
+}
+
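+/* [Editor's note -- usage sketch, not part of the original NVIDIA header]
+* Packing floats into a __half2 with the conversions above (host or device):
+*
+* __half2 pair = __floats2half2_rn(1.0f, 2.0f); // low = 1.0, high = 2.0
+* __half2 twos = __float2half2_rn(2.0f); // both halves = 2.0
+* float lo = __low2float(pair); // declared below; yields 1.0f
+*/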
+#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
+static float __internal_half2float(const unsigned short h)
+{
+ unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
+ unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
+ unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
+ float f;
+ if (exponent == 0x1fU) { /* NaN or Inf */
+ sign = ((mantissa != 0U) ? 0U : sign);
+ mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+ exponent = 0xffU;
+ } else if (exponent == 0U) { /* Denorm or Zero */
+ if (mantissa != 0U) {
+ unsigned int msb;
+ exponent = 0x71U;
+ do {
+ msb = (mantissa & 0x400000U);
+ mantissa <<= 1U; /* normalize */
+ --exponent;
+ } while (msb == 0U);
+ mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+ }
+ } else {
+ exponent += 0x70U;
+ }
+ unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__)
+ (void)memcpy(&f, &u, sizeof(u));
+#else
+ (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+ return f;
+}
+#endif /* !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
+{
+ float val;
+#if defined(__CUDA_ARCH__)
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
+#else
+ val = __internal_half2float(static_cast<__half_raw>(a).x);
+#endif
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
+{
+ float val;
+#if defined(__CUDA_ARCH__)
+ asm("{.reg .f16 low,high;\n"
+ " mov.b32 {low,high},%1;\n"
+ " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+ val = __internal_half2float(static_cast<__half2_raw>(a).x);
+#endif
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
+{
+ float val;
+#if defined(__CUDA_ARCH__)
+ asm("{.reg .f16 low,high;\n"
+ " mov.b32 {low,high},%1;\n"
+ " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+ val = __internal_half2float(static_cast<__half2_raw>(a).y);
+#endif
+ return val;
+}
+
+/* Intrinsic functions only available to nvcc compilers */
+#if defined(__CUDACC__)
+
+/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
+__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(__half x, __half y)
+{
+ __half2 t; t.x = x; t.y = y; return t;
+}
+#undef __VECTOR_FUNCTIONS_DECL__
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 f)
+{
+ __half2 val = __floats2half2_rn(f.x, f.y);
+ return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 l)
+{
+ float hi_float;
+ float lo_float;
+#if defined(__CUDA_ARCH__)
+ asm("{.reg .f16 low,high;\n"
+ " mov.b32 {low,high},%1;\n"
+ " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(l)));
+
+ asm("{.reg .f16 low,high;\n"
+ " mov.b32 {low,high},%1;\n"
+ " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(l)));
+#else
+ lo_float = __internal_half2float(((__half2_raw)l).x);
+ hi_float = __internal_half2float(((__half2_raw)l).y);
+#endif
+ return make_float2(lo_float, hi_float);
+}
+__CUDA_FP16_DECL__ int __half2int_rn(__half h)
+{
+ int i;
+ asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+ return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(__half h)
+{
+ int i;
+#if defined __CUDA_ARCH__
+ asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+#else
+ const float f = __half2float(h);
+ i = static_cast<int>(f);
+ const int max_val = (int)0x7fffffffU;
+ const int min_val = (int)0x80000000U;
+ // saturation fixup
+ if (f != f) {
+ // NaN
+ i = 0;
+ } else if (f > static_cast<float>(max_val)) {
+ // saturate maximum
+ i = max_val;
+ } else if (f < static_cast<float>(min_val)) {
+ // saturate minimum
+ i = min_val;
+ }
+#endif
+ return i;
+}
+__CUDA_FP16_DECL__ int __half2int_rd(__half h)
+{
+ int i;
+ asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+ return i;
+}
+__CUDA_FP16_DECL__ int __half2int_ru(__half h)
+{
+ int i;
+ asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h)));
+ return i;
+}
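+/* [Editor's note -- behavior sketch, not part of the original NVIDIA header]
+* The host path of __half2int_rz above truncates toward zero and then applies
+* the saturation fixup, so out-of-range and NaN inputs are well defined:
+* NaN -> 0; +Inf (0x7c00) -> max_val (INT_MAX); -Inf (0xfc00) -> min_val (INT_MIN);
+* half(3.75) -> 3. The same pattern repeats for the short/uint/ushort/ull
+* conversions that follow.
+*/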
"=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(int i) +{ + __half h; +#if defined(__CUDA_ARCH__) + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +#else + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. + const float f = static_cast(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rz(int i) +{ + __half h; + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rd(int i) +{ + __half h; + asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_ru(int i) +{ + __half h; + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ short int __half2short_rn(__half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(__half h) +{ + short int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); +#else + const float f = __half2float(h); + i = static_cast(f); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + // saturation fixup + if (f != f) { + // NaN + i = 0; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } +#endif + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(__half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(__half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(short int i) +{ + __half h; +#if defined __CUDA_ARCH__ + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +#else + const float f = static_cast(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rz(short int i) +{ + __half h; + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rd(short int i) +{ + __half h; + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_ru(short int i) +{ + __half h; + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(__half h) +{ + unsigned int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); +#else + const float f = __half2float(h); + i = static_cast(f); + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + // saturation fixup + if (f != f) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } +#endif + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : 
"h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(unsigned int i) +{ + __half h; +#if defined __CUDA_ARCH__ + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +#else + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. + const float f = static_cast(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i) +{ + __half h; + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i) +{ + __half h; + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i) +{ + __half h; + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(__half h) +{ + unsigned short int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); +#else + const float f = __half2float(h); + i = static_cast(f); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + // saturation fixup + if (f != f) { + // NaN + i = 0U; + } else if (f > static_cast(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast(min_val)) { + // saturate minimum + i = min_val; + } +#endif + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(unsigned short int i) +{ + __half h; +#if defined __CUDA_ARCH__ + asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +#else + const float f = static_cast(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i) +{ + __half h; + asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i) +{ + __half h; + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i) +{ + __half h; + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(__half h) +{ + unsigned long long int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); +#else + const float f = __half2float(h); + i = static_cast(f); + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + // 
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h)
+{
+    unsigned long long int i;
+    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(__half h)
+{
+    unsigned long long int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+#else
+    const float f = __half2float(h);
+    i = static_cast<unsigned long long int>(f);
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    // saturation fixup
+    if (f != f) {
+        // NaN
+        i = 0x8000000000000000ULL;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h)
+{
+    unsigned long long int i;
+    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h)
+{
+    unsigned long long int i;
+    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(unsigned long long int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h)
+{
+    long long int i;
+    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(__half h)
+{
+    long long int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+#else
+    const float f = __half2float(h);
+    i = static_cast<long long int>(f);
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    // saturation fixup
+    if (f != f) {
+        // NaN
+        i = min_val;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h)
+{
+    long long int i;
+    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h)
+{
+    long long int i;
+    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h)));
+    return i;
+}
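
Note that the 64-bit conversions use a different NaN convention from the 16- and
32-bit ones: NaN maps to 0x8000000000000000 for both the signed and the unsigned
result, exactly as the fixup code above encodes. A host-side sketch:

#include <cmath>
#include <cstdio>
#include <cuda_fp16.h>

int main()
{
    printf("%lld\n", __half2ll_rz(__float2half(nanf(""))));    // -9223372036854775808
    printf("%llu\n", __half2ull_rz(__float2half(nanf(""))));   // 9223372036854775808
    printf("%lld\n", __half2ll_rz(__float2half(-65504.0f)));   // -65504
    return 0;
}
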
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(long long int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ __half htrunc(const __half h)
+{
+    __half r;
+    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hceil(const __half h)
+{
+    __half r;
+    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hfloor(const __half h)
+{
+    __half r;
+    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hrint(const __half h)
+{
+    __half r;
+    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rzi.f16.f16 low, low;\n"
+        "  cvt.rzi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rpi.f16.f16 low, low;\n"
+        "  cvt.rpi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rmi.f16.f16 low, low;\n"
+        "  cvt.rmi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rni.f16.f16 low, low;\n"
+        "  cvt.rni.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 l, const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 l, const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half __low2half(const __half2 h)
+{
+    __half ret;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h)));
+    return ret;
+}
+__CUDA_FP16_DECL__ int __hisinf(const __half a)
+{
+    if (__HALF_TO_CUS(a) == 0xFC00) {
+        return -1;
+    }
+    if (__HALF_TO_CUS(a) == 0x7C00) {
+        return 1;
+    }
+    return 0;
+}
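
The rounding intrinsics above are device-only, so a sketch has to live in a
kernel (the kernel name is illustrative):

#include <cuda_fp16.h>

// Illustrative kernel: the four rounding flavours applied to one value.
// For x = -1.5 the results are -1 (trunc), -2 (floor), -1 (ceil) and
// -2 (rint, which rounds ties to even).
__global__ void round_modes(const __half x, __half *out)
{
    out[0] = htrunc(x);
    out[1] = hfloor(x);
    out[2] = hceil(x);
    out[3] = hrint(x);
}
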
+__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 l)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 l)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half __high2half(const __half2 h)
+{
+    __half ret;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h)));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __halves2half2(const __half l, const __half h)
+{
+    __half2 val;
+    asm("{  mov.b32 %0, {%1,%2};}\n"
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(l)), "h"(__HALF_TO_CUS(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __half2half2(const __half lh)
+{
+    __half2 val;
+    asm("{  mov.b32 %0, {%1,%1};}\n"
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(lh)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 lh)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(lh)));
+    return val;
+}
+__CUDA_FP16_DECL__ short int __half_as_short(const __half h)
+{
+    return (short int)__HALF_TO_CUS(h);
+}
+__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
+{
+    return __HALF_TO_CUS(h);
+}
+__CUDA_FP16_DECL__ __half __short_as_half(const short int i)
+{
+    __half h;
+    __HALF_TO_US(h) = (unsigned short int)i;
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
+{
+    __half h;
+    __HALF_TO_US(h) = i;
+    return h;
+}
+
+#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
+/******************************************************************************
+*                    __half, __half2 warp shuffle                            *
+******************************************************************************/
+#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
+    __half2 r; \
+    asm volatile ("{"#name" %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
+    return r; \
+} /* while(0) */
+
+#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
+    __half2 r; \
+    asm volatile ("{"#name" %0,%1,%2,%3,%4;\n}" \
+        :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+    return r; \
+} /* while(0) */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+
+__CUDA_FP16_DECL__ __half2 __shfl(__half2 var, int delta, int width)
+{
+    int warpSize;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize));
+    int c = ((warpSize - width) << 8) | 0x1f;
+    __SHUFFLE_HALF2_MACRO(shfl.idx.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up(__half2 var, unsigned int delta, int width)
+{
+    int warpSize;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize));
+    int c = (warpSize - width) << 8;
+    __SHUFFLE_HALF2_MACRO(shfl.up.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down(__half2 var, unsigned int delta, int width)
+{
+    int warpSize;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize));
+    int c = ((warpSize - width) << 8) | 0x1f;
+    __SHUFFLE_HALF2_MACRO(shfl.down.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor(__half2 var, int delta, int width)
+{
+    int warpSize;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize));
+    int c = ((warpSize - width) << 8) | 0x1f;
+    __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)
+}
+
+#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+__CUDA_FP16_DECL__ __half2
__shfl_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = (warpSize - width) << 8; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32) +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half __shfl(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2half(temp2); +} + +#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)*/ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || 
defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *ptr, __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *ptr, __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half2 *ptr, __half2 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half *ptr, __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *ptr, __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *ptr, __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *ptr, __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *ptr, __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) 
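
Before the comparison section: the cache-hint loads and stores defined above are
meant for traffic with a known reuse pattern. A minimal device-side sketch with
an illustrative kernel name; __ldcs/__stcs use the "cache streaming" (evict-first)
policy, so a one-pass copy does not displace reusable cache lines:

#include <cuda_fp16.h>

// Illustrative streaming copy over packed half pairs.
__global__ void stream_copy(const __half2 *src, __half2 *dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        __stcs(&dst[i], __ldcs(&src[i]));
    }
}
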
+/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __COMPARISON_OP_HALF2_MACRO +#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + if (__HALF2_TO_CUI(val) == 0x3C003C00) \ + return true; \ + else \ + return false; \ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef 
__BOOL_COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp."#name".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return val ? true : false; \ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq) +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne) +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le) +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge) +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt) +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt) +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ) +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu) +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu) +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu) +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu) +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add) +} +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub) +} +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul) +} +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half2 __h2div(__half2 a, __half2 b) { + __half ha, hb; + + ha = __low2half(a); + hb = __low2half(b); + + __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add) +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + 
__BINARY_OP_HALF_MACRO(sub) +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul) +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat) +} + +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) { + __half v, abs, den; + __HALF_TO_US(den) = 0x008F; + float fa, fb, fv, rcp; + + fa = __half2float(a); + fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) { + float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc,"#spc";\n"\ + " mov.b32 ulp,"#ulp";\n"\ + " set.eq.f16x2.f16x2 p,"#i", spc;\n"\ + " fma.rn.f16x2 "#r",p,ulp,"#r";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc,"#spc";\n"\ + " mov.b16 ulp,"#ulp";\n"\ + " set.eq.f16.f16 p,"#i", spc;\n"\ + " fma.rn.f16 "#r",p,ulp,"#r";\n}\n" +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " "#fun".approx.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " "#fun".approx.f32 fl, fl; \n"\ + " "#fun".approx.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +static __device__ __forceinline__ float __float_simpl_sinf(float); +static __device__ __forceinline__ float __float_simpl_cosf(float); +__CUDA_FP16_DECL__ __half __hsin_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_sinf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hsin(const __half a) { + __half r = __hsin_internal(a); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " mov.b16 t, 0x8000; \n\t" + " and.b16 t,r,t; \n\t" + __SPEC_CASE(i, r, 0X32B3, 0x0800) + __SPEC_CASE(i, r, 0X5CB0, 0x1000) + __SPEC_CASE(i, r, 0XB2B3, 0x8800) + __SPEC_CASE(i, r, 0XDCB0, 0x9000) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = 
__halves2half2(__hsin_internal(l), __hsin_internal(h)); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3, 0x08000800) + __SPEC_CASE2(i, r, 0X5CB05CB0, 0x10001000) + __SPEC_CASE2(i, r, 0XB2B3B2B3, 0x88008800) + __SPEC_CASE2(i, r, 0XDCB0DCB0, 0x90009000) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hcos_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_cosf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + __half r = __hcos_internal(a); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + __SPEC_CASE(i, r, 0X2B7C, 0x1000) + __SPEC_CASE(i, r, 0XAB7C, 0x1000) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = __halves2half2(__hcos_internal(l), __hcos_internal(h)); + asm("{\n\t" + " .reg.b32 i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7C, 0x10001000) + __SPEC_CASE2(i, r, 0XAB7CAB7C, 0x10001000) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(float a, int *quadrant) +{ + float j, t; + int q; + q = __float2int_rn(a * 0.636619772F); + j = (float)q; + t = __fmaf_rn(-j, 1.5707962512969971e+000F, a); + t = __fmaf_rn(-j, 7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, int i) +{ + float x2, z; + x2 = x*x; + + if (i & 1) { + z = 2.44331571e-5F; + z = __fmaf_rn(z, x2, -1.38873163e-3F); + } + else { + z = -1.95152959e-4F; + z = __fmaf_rn(z, x2, 8.33216087e-3F); + } + if (i & 1) { + z = __fmaf_rn(z, x2, 4.16666457e-2F); + z = __fmaf_rn(z, x2, -5.00000000e-1F); + } + else { + z = __fmaf_rn(z, x2, -1.66666546e-1F); + z = __fmaf_rn(z, x2, 0.0F); + } + x = __fmaf_rn(z, x, x); + if (i & 1) { + x = __fmaf_rn(z, x2, 1.0F); + } + if (i & 2) { + x = __fmaf_rn(x, -1.0F, 0.0F); + } + return x; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + int i; + if (::isinf(a)) { + a = a * 0.0F; + } + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + int i; + if (::isinf(a)) { + a = a * 0.0F; + } + a = __internal_trig_reduction_kernel(a, &i); + i++; + z = __internal_sin_cos_kernel(a, i); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79, 0x9400) + __SPEC_CASE(h, r, 0X25CF, 0x9400) + __SPEC_CASE(h, r, 0XC13B, 0x0400) + __SPEC_CASE(h, r, 0XC1EF, 0x0200) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 
fu, hu; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79, 0x94009400) + __SPEC_CASE2(h, r, 0X25CF25CF, 0x94009400) + __SPEC_CASE2(h, r, 0XC13BC13B, 0x04000400) + __SPEC_CASE2(h, r, 0XC1EFC1EF, 0x02000200) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.f32 f,f; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DE, 0x9800) + __SPEC_CASE(h, r, 0x9766, 0x9000) + __SPEC_CASE(h, r, 0x9972, 0x1000) + __SPEC_CASE(h, r, 0xA5C4, 0x1000) + __SPEC_CASE(h, r, 0xBF0A, 0x8100) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DE, 0x98009800) + __SPEC_CASE2(h, r, 0x97669766, 0x90009000) + __SPEC_CASE2(h, r, 0x99729972, 0x10001000) + __SPEC_CASE2(h, r, 0xA5C4A5C4, 0x10001000) + __SPEC_CASE2(h, r, 0xBF0ABF0A, 0x81008100) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2, 0x8080) + __SPEC_CASE(r, r, 0xBF46, 0x9400) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + 
__SPEC_CASE2(r, r, 0xA2E2A2E2, 0x80808080) + __SPEC_CASE2(r, r, 0xBF46BF46, 0x94009400) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160D, 0x9C00) + __SPEC_CASE(h, r, 0X3BFE, 0x8010) + __SPEC_CASE(h, r, 0X3C0B, 0x8080) + __SPEC_CASE(h, r, 0X6051, 0x1C00) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160D, 0x9C009C00) + __SPEC_CASE2(h, r, 0X3BFE3BFE, 0x80108010) + __SPEC_CASE2(h, r, 0X3C0B3C0B, 0x80808080) + __SPEC_CASE2(h, r, 0X60516051, 0x1C001C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " mov.b32 C, 0x3E9A209B; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338F, 0x1000) + __SPEC_CASE(h, r, 0x33F8, 0x9000) + __SPEC_CASE(h, r, 0x57E1, 0x9800) + __SPEC_CASE(h, r, 0x719D, 0x9C00) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209B; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338F, 0x10001000) + __SPEC_CASE2(h, r, 0x33F833F8, 0x90009000) + __SPEC_CASE2(h, r, 0x57E157E1, 0x98009800) + __SPEC_CASE2(h, r, 0x719D719D, 0x9C009C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ bool __hisnan(const __half a) +{ + __half r; + 
asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +} +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hneg(const __half a) +{ + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __habs(const __half a) +{ + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} + +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + const __half2 a_re = __half2half2(a.x); + __half2 acc = __hfma2(a_re, b, c); + const __half2 a_im = __half2half2(a.y); + const __half2 ib = __halves2half2(__hneg(b.y), b.x); + acc = __hfma2(a_im, ib, acc); + return acc; +} +#endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ + +#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(max) +} +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(min) +} +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +} +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(max) +} +__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(min) +} +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +} +#endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/ + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *address, __half2 val) { + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), 
"r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +__CUDA_FP16_DECL__ __half atomicAdd(__half *address, __half val) { + __half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/ + +#undef __PTR + +#undef __CUDA_FP16_DECL__ +#endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +typedef __half half; +typedef __half2 half2; +// for consistency with __nv_bfloat16 +typedef __half __nv_half; +typedef __half2 __nv_half2; +typedef __half_raw __nv_half_raw; +typedef __half2_raw __nv_half2_raw; +typedef __half nv_half; +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.h b/cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.h new file mode 100644 index 0000000..8dc1c29 --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.h @@ -0,0 +1,3794 @@ +/* +* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. 
These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions that are +* only supported in device code. +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) +#if defined(__CUDACC__) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__CUDACC__) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ + +/** + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. 
+ * + * \internal + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. + * \endinternal + */ +struct __half; + +/** + * \brief half2 datatype + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * + * \internal + * \req Vectorified version of half. + * \endinternal + */ +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts \p half number to float.
+*
+* \details Converts half number \p a to float.
+* \param[in] a - half. Is only being read.
+*
+* \returns float
+* - \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts input to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+*
+* \details Converts input \p a to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+* \param[in] a - float. Is only being read.
+*
+* \returns half2
+* - The \p half2 value with both halves equal to the converted half
+* precision number.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both input floats to half precision in round-to-nearest-even
+* mode and returns \p half2 with converted values.
+*
+* \details Converts both input floats to half precision in round-to-nearest-even mode
+* and combines the results into one \p half2 number. Low 16 bits of the return
+* value correspond to the input \p a, high 16 bits correspond to the input \p
+* b.
+* \param[in] a - float. Is only being read.
+* \param[in] b - float. Is only being read.
+*
+* \returns half2
+* - The \p half2 value with corresponding halves equal to the
+* converted input floats.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts low 16 bits of \p half2 to float and returns the result
+*
+* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read.
+*
+* \returns float
+* - The low 16 bits of \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts high 16 bits of \p half2 to float and returns the result
+*
+* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read.
+*
+* \returns float
+* - The high 16 bits of \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer.
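+*
+* For example (an illustrative sketch):
+* \code
+* short s = __half2short_rz(__float2half(-1.5f)); // s == -1 (truncated)
+* \endcode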
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h); + +#if defined(__CUDACC__) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of float2 to half precision in round-to-nearest +* mode and combines the results into one \p half2 number. 
Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* - The \p half2 which has corresponding halves equal to the +* converted float2 components. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. 
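+*
+* For example (an illustrative sketch; representable half values near 33000
+* are 32 apart, so the conversion must round):
+* \code
+* __half h = __int2half_rz(33000); // rounds towards zero, giving 32992
+* \endcode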
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rz(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rd(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_ru(const int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. 
+* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rz(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rd(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_ru(const short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. 
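+*
+* For example (an illustrative sketch; 33008 lies exactly halfway between
+* the representable neighbours 32992 and 33024):
+* \code
+* __half h = __uint2half_rn(33008u); // ties-to-even picks 33024
+* \endcode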
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-towards-zero mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-down mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-up mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int.
Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. 
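+*
+* For example (an illustrative sketch):
+* \code
+* unsigned long long v = __half2ull_ru(__float2half(2.25f)); // v == 3
+* \endcode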
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. 
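+*
+* For example (an illustrative sketch):
+* \code
+* long long v = __half2ll_rd(__float2half(-2.25f)); // v == -3 (rounds towards -inf)
+* \endcode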
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+*
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The truncated integer value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htrunc(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+*
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The smallest integer value not less than \p h.
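+*
+* For example (an illustrative sketch):
+* \code
+* __half h = hceil(__float2half(1.25f)); // h == 2.0
+* \endcode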
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The largest integer value which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round \p h to the nearest integer value in half-precision floating-point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of smallest integers not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of largest integers which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round each component of \p half2 vector \p h to the nearest integer value in +* half-precision floating-point format, with halfway cases rounded to the +* nearest even integer value. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of rounded integer values. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns \p half2 with both halves equal to the input value. 
+*
+* \details Returns a \p half2 number with both halves equal to the input \p half
+* number \p a.
+* \param[in] a - half. Is only being read.
+*
+* \returns half2
+* - The vector which has both its halves equal to the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __half2half2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Swaps both halves of the \p half2 input.
+*
+* \details Swaps both halves of the \p half2 input and returns a new \p half2 number
+* with swapped halves.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - \p a with its halves being swapped.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number.
+*
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of
+* the return value, low 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The low 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+*
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. High 16 bits from input \p a are stored in low 16 bits of
+* the return value, high 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The high 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* - The high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* - Returns \p half which contains low 16 bits of the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+*
+* \details Checks if the input \p half number \p a is infinite.
+* \param[in] a - half. Is only being read.
+*
+* \returns int
+* - -1 iff \p a is equal to negative infinity,
+* - 1 iff \p a is equal to positive infinity,
+* - 0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+*
+* \details Combines two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half2
+* - The half2 with one half equal to \p a and the other to \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from \p half2 input.
+*
+* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The half2 with both halves equal to the low 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from \p half2 input.
+*
+* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The half2 with both halves equal to the high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as a signed short integer.
+*
+* \details Reinterprets the bits in the half-precision floating-point number \p h
+* as a signed short integer.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half_as_short(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as an unsigned short integer.
+*
+* \details Reinterprets the bits in the half-precision floating-point number \p h
+* as an unsigned short integer.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a signed short integer as a \p half.
+*
+* \details Reinterprets the bits in the signed short integer \p i as a
+* half-precision floating-point number.
+* \param[in] i - short int. Is only being read.
+*
+* \returns half
+* - The reinterpreted value.
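+*
+* For example (an illustrative sketch; 0x3C00 is the IEEE 754 binary16 bit
+* pattern of 1.0):
+* \code
+* __half one = __short_as_half((short)0x3C00); // one == 1.0
+* \endcode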
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a and \p b
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize 32
+#define __local_warpSize
+#endif
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+#define __WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize);
+#endif
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the delta modulo width (i.e.
+* within the same subsection). width must have a value which is a power of 2;
+* results are undefined if width is not a power of 2, or is a number greater than
+* warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
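+*
+* A minimal usage sketch (illustrative; assumes all 32 lanes of the warp are
+* active and the variable \p val is a hypothetical per-lane \p half2 value):
+* \code
+* // Broadcast lane 0's value of val to every lane in the warp.
+* __half2 v = __shfl_sync(0xFFFFFFFFu, val, 0);
+* \endcode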
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. 
If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */ + +#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) ) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. 
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. 
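+* A short sketch (assumes the stored values are not re-read by this kernel,
+* so caching at the global (L2) level only is acceptable; fill_fp16 is a
+* hypothetical kernel name):
+* \code
+* __global__ void fill_fp16(__half *dst, const __half v, const int n)
+* {
+*     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n) {
+*         __stcg(dst + i, v); // st.global.cg: bypass L1, cache in L2
+*     }
+* }
+* \endcode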
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); +#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. 
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The half2 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. 
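+* For instance, the ordered and unordered forms differ only when a NaN is
+* present; a sketch of a lane-wise select built on the returned 1.0/0.0 mask
+* (min2_nan_prop is a hypothetical helper; lanes containing a NaN input
+* propagate NaN through the arithmetic):
+* \code
+* __device__ __half2 min2_nan_prop(const __half2 a, const __half2 b)
+* {
+*     const __half2 m = __hleu2(a, b); // 1.0 where a <= b or either is NaN
+*     const __half2 one = __float2half2_rn(1.0f);
+*     // Lane-wise select: m*a + (1-m)*b; NaN inputs propagate to the result.
+*     return __hadd2(__hmul2(m, a), __hmul2(__hsub2(one, m), b));
+* }
+* \endcode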
+* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Determine whether \p half2 argument is a NaN. +* +* \details Determine whether each half of input \p half2 number \p a is a NaN. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with the corresponding \p half results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. 
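+*
+* A minimal kernel sketch (assumes the arrays are __half2-aligned and n2 is
+* the element count in __half2 units; vadd_fp16 is a hypothetical name):
+* \code
+* __global__ void vadd_fp16(__half2 *c, const __half2 *a, const __half2 *b, const int n2)
+* {
+*     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n2) {
+*         c[i] = __hadd2(a[i], b[i]); // two half additions per instruction
+*     }
+* }
+* \endcode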
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub +* into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of +* mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. 
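+*
+* The _rn variants are for cases where the separately rounded product must
+* be preserved; a sketch (mul_then_add is a hypothetical helper):
+* \code
+* __device__ __half2 mul_then_add(const __half2 a, const __half2 b, const __half2 c)
+* {
+*     const __half2 p = __hmul2_rn(a, b); // product rounded on its own
+*     return __hadd2_rn(p, c);            // sum rounded on its own; not
+*                                         // contracted into a fused fma
+* }
+* \endcode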
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector division in round-to-nearest-even mode. +* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with the absolute value of both halves. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. 
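+*
+* A sketch of a weighted blend of two [0.0, 1.0] signals, where saturation
+* absorbs any rounding overshoot (blend01 is a hypothetical helper and the
+* weights in \p w are assumed to lie in [0.0, 1.0]):
+* \code
+* __device__ __half2 blend01(const __half2 x, const __half2 y, const __half2 w)
+* {
+*     const __half2 one = __float2half2_rn(1.0f);
+*     return __hadd2_sat(__hmul2_sat(x, w),
+*                        __hmul2_sat(y, __hsub2_sat(one, w)));
+* }
+* \endcode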
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with both halves negated. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The absolute value of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. 
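+*
+* A minimal scalar sketch (avg is a hypothetical helper; no special
+* handling of overflow to infinity is attempted):
+* \code
+* __device__ __half avg(const __half a, const __half b)
+* {
+*     return __hmul(__hadd(a, b), __float2half(0.5f));
+* }
+* \endcode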
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of dividing \p a by \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. 
Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* \details Negates input \p half number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - half. Is only being read. +* +* \returns half +* - minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison and returns boolean true +* iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. 
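+*
+* These bool-valued forms collapse both lanes into a single predicate; a
+* sketch of a per-thread convergence test (converged is a hypothetical
+* helper, with the tolerance replicated in both lanes of \p tol):
+* \code
+* __device__ bool converged(const __half2 cur, const __half2 prev, const __half2 tol)
+* {
+*     const __half2 d = __habs2(__hsub2(cur, prev));
+*     return __hble2(d, tol); // true only if BOTH lane differences are <= tol
+* }
+* \endcode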
+* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. 
+* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. 
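+*
+* The unordered forms return true whenever either input is NaN; a sketch of
+* a comparator that sorts NaN keys first (less_nan_first is a hypothetical
+* helper):
+* \code
+* __device__ bool less_nan_first(const __half a, const __half b)
+* {
+*     // Unordered less-than is true when either input is NaN; excluding the
+*     // case where b is NaN makes NaN keys compare "less" than all others.
+*     return __hltu(a, b) && !__hisnan(b);
+* }
+* \endcode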
+* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* \details Determine whether \p half value \p a is a NaN. +* \param[in] a - half. Is only being read. +* +* \returns bool +* - true iff argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hisnan(const __half a); +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values, NaNs pass through. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values, NaNs pass through. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
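+*
+* A sketch of a NaN-propagating running maximum (max_all_nan is a
+* hypothetical helper; assumes n2 >= 1):
+* \code
+* __device__ __half2 max_all_nan(const __half2 *x, const int n2)
+* {
+*     __half2 m = x[0];
+*     for (int i = 1; i < n2; ++i) {
+*         m = __hmax2_nan(m, x[i]); // a NaN lane, once seen, survives
+*     }
+*     return m;
+* }
+* \endcode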
+* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) */ +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as +* complex numbers in \p half precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input \p a in round-to-nearest +* mode. +* \param[in] a - half. Is only being read. 
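+*
+* A sketch that normalizes a 2-vector packed as (x, y) in one __half2
+* (normalize2 is a hypothetical helper; assumes a nonzero vector):
+* \code
+* __device__ __half2 normalize2(const __half2 v)
+* {
+*     const __half2 sq = __hmul2(v, v);                    // (x*x, y*y)
+*     const __half  s  = __hadd(__low2half(sq), __high2half(sq));
+*     return __hmul2(v, __half2half2(hrsqrt(s)));          // v / sqrt(x*x + y*y)
+* }
+* \endcode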
+* +* \returns half +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half binary exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal exponential function on \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest +* mode. +* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise reciprocal on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary logarithm on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise sine on vector \p a. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the +* two __half elements; the entire __half2 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 6.x and higher. +* +* \param[in] address - half2*. An address in global or shared memory. +* \param[in] val - half2. The value to be added. +* +* \returns half2 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) + +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher. +* +* \param[in] address - half*. An address in global or shared memory. +* \param[in] val - half. The value to be added. +* +* \returns half +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)*/ + +#endif /* defined(__CUDACC__) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ + +#endif /* defined(__cplusplus) */ + +/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */ +#include "cuda_fp16.hpp" +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.hpp b/cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.hpp new file mode 100644 index 0000000..e15f46b --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-11/cuda_fp16.hpp @@ -0,0 +1,2614 @@ +/* +* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. 
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users. These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_FP16_HPP__)
+#define __CUDA_FP16_HPP__
+
+#if !defined(__CUDA_FP16_H__)
+#error "Do not include this file directly. Instead, include cuda_fp16.h."
+#endif
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#endif
+
+/* C++11 header for std::move.
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Macros for half & half2 binary arithmetic */
+#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
+   return val; \
+} /* while(0) */
+
+/**
+* Types which allow static initialization of "half" and "half2" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "half", and not a conversion from short->half.
+* Such a representation will be deprecated in a future version of CUDA.
+* (Note these are visible to non-nvcc compilers, including C-only compilation) +*/ +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + +typedef struct __CUDA_ALIGN__(4) { + unsigned short x; + unsigned short y; +} __half2_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; + +public: +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half() = default; +#else + __CUDA_HOSTDEVICE__ __half() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /* Convert to/from __half_raw */ + __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { } + __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } + __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; } + +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) + + /* Construct from float/double */ + __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; } + __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; } + + __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; } + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; } + +/* Member functions only available to nvcc compilation so far */ +#if defined(__CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; } + + /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ + __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = 
__short2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
+
+    /* Boolean conversion - note both 0 and -0 must return false */
+    __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
+#endif /* defined(__CUDACC__) */
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+};
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+/* Arithmetic FP16 operations only supported on arch >= 5.3 */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+#if !defined(__CUDA_NO_HALF_OPERATORS__)
+/* Some basic arithmetic operations expected of a builtin */
+__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
+__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
+__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
+__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
+
+__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; }
+__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
+__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
+__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
+
+/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
+__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; }
+__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
+__device__ __forceinline__ __half operator++(__half &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half ret = h;
+    __half_raw one;
+    one.x = 0x3C00U;
+    h += one;
+    return ret;
+}
+__device__ __forceinline__ __half operator--(__half &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half ret = h;
+    __half_raw one;
+    one.x = 0x3C00U;
+    h -= one;
+    return ret;
+}
+
+/* Unary plus and inverse operators */
+__device__ __forceinline__ __half operator+(const __half &h) { return h; }
+__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
+
+/* Some basic comparison operations to make it look like a builtin */
+__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); }
+__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
+__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
+#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
+#endif /* defined(__CUDACC__) */
+
+/* __half2 is visible to non-nvcc host compilers */
+struct __CUDA_ALIGN__(4) __half2 {
+    __half x;
+    __half y;
+
+    // All construct/copy/assign/move
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half2() = default;
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; }
+#else
+    __CUDA_HOSTDEVICE__ __half2() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+    __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; }
+
+    /* Convert to/from __half2_raw */
+    __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; }
+    __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; }
+};
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
+
+__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
+__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
+__device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); }
+__device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); }
+
+__device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
+
+__device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
+__device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
+__device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half2 ret = h;
+    __half2_raw one;
+    one.x = 0x3C00U;
+    one.y = 0x3C00U;
+    h = __hadd2(h, one);
+    return ret;
+}
+__device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half2 ret = h;
+    __half2_raw one;
+    one.x = 0x3C00U;
+    one.y = 0x3C00U;
+    h = __hsub2(h, one);
+    return ret;
+}
+
+__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
+__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
+
+__device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
+__device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
+__device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
+
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
+#endif /* defined(__CUDACC__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
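With the conversions and operator overloads above, __half and __half2 behave much like builtin floating-point types in device code. A minimal sketch of a kernel that relies on them (the kernel name and launch shape are illustrative, not part of this header; the operators require arch >= 5.3):

    #include <cuda_fp16.h>

    // y[i] = a * x[i] + y[i], written with the __half operator overloads
    __global__ void haxpy(const __half a, const __half *x, __half *y, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            y[i] = a * x[i] + y[i];
        }
    }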
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
+{
+    unsigned int x;
+    unsigned int u;
+    unsigned int result;
+#if defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+    u = (x & 0x7fffffffU);
+    sign = ((x >> 16U) & 0x8000U);
+    // NaN/+Inf/-Inf
+    if (u >= 0x7f800000U) {
+        remainder = 0U;
+        result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
+    } else if (u > 0x477fefffU) { // Overflows
+        remainder = 0x80000000U;
+        result = (sign | 0x7bffU);
+    } else if (u >= 0x38800000U) { // Normal numbers
+        remainder = u << 19U;
+        u -= 0x38000000U;
+        result = (sign | (u >> 13U));
+    } else if (u < 0x33000001U) { // +0/-0
+        remainder = u;
+        result = sign;
+    } else { // Denormal numbers
+        const unsigned int exponent = u >> 23U;
+        const unsigned int shift = 0x7eU - exponent;
+        unsigned int mantissa = (u & 0x7fffffU);
+        mantissa |= 0x800000U;
+        remainder = mantissa << (32U - shift);
+        result = (sign | (mantissa >> shift));
+        result &= 0x0000FFFFU;
+    }
+    return static_cast<unsigned short>(result);
+}
+#endif /* #if !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
+{
+#if defined(__CUDA_ARCH__)
+    __half val;
+    asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
+    return val;
+#else
+    __half result;
+    /*
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    */
+    unsigned long long int absa;
+    unsigned long long int ua;
+    #if defined(__CUDACC__)
+    (void)memcpy(&ua, &a, sizeof(a));
+    #else
+    (void)std::memcpy(&ua, &a, sizeof(a));
+    #endif
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        /*
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        */
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        /*
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        */
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {
+            /*
+            // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            */
+            shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {
+            /*
+            // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            */
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        #if defined(__CUDACC__)
+        (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        #else
+        (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        #endif
+        double aShiftRound = a + shifter;
+
+        /*
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing intermediate memcopy and harmless bitwise operation
+        */
+        unsigned long long int aShiftRoundBits;
+        #if defined(__CUDACC__)
+        (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+        #else
+        (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+        #endif
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        #if defined(__CUDACC__)
+        (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+        #else
+        (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+        #endif
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+#endif
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
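The scalar conversion routines above are callable from both host and device; the suffix selects the rounding mode. A small host-side sketch, assuming compilation with nvcc (0.1f is chosen only because it is not exactly representable in half precision, so the four modes give different results):

    #include <cuda_fp16.h>
    #include <cstdio>

    int main()
    {
        const float f = 0.1f;                  // inexact in half precision
        const __half rn = __float2half_rn(f);  // round-to-nearest-even
        const __half rz = __float2half_rz(f);  // round-toward-zero
        const __half ru = __float2half_ru(f);  // round-up
        const __half rd = __float2half_rd(f);  // round-down
        std::printf("%.8f %.8f %.8f %.8f\n", __half2float(rn), __half2float(rz),
                    __half2float(ru), __half2float(rd));
        return 0;
    }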
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low;\n"
+        " cvt.rn.f16.f32 low, %1;\n"
+        " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(a));
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+#if (__CUDA_ARCH__ >= 800)
+    asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
+        : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+#else
+    asm("{.reg .f16 low,high;\n"
+        " cvt.rn.f16.f32 low, %1;\n"
+        " cvt.rn.f16.f32 high, %2;\n"
+        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+#endif
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(b));
+#endif
+    return val;
+}
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline float __internal_half2float(const unsigned short h)
+{
+    unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
+    unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
+    unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
+    float f;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U;
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+    return f;
+}
+#endif /* !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
+#else
+    val = __internal_half2float(static_cast<__half_raw>(a).x);
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+    val = __internal_half2float(static_cast<__half2_raw>(a).x);
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+    val = __internal_half2float(static_cast<__half2_raw>(a).y);
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
+{
+    short int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<short int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
+{
+    unsigned short int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned short int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
+{
+    int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
+{
+    unsigned int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
+{
+    long long int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = min_val;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<long long int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
+{
+    unsigned long long int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0x8000000000000000ULL;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned long long int>(f);
+    }
+#endif
+    return i;
+}
+
+/* Intrinsic functions only available to nvcc compilers */
+#if defined(__CUDACC__)
+
+/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
+__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y)
+{
+    __half2 t; t.x = x; t.y = y; return t;
+}
+#undef __VECTOR_FUNCTIONS_DECL__
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
+{
+    const __half2 val = __floats2half2_rn(a.x, a.y);
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
+{
+    float hi_float;
+    float lo_float;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));
+
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high},%1;\n"
+        " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));
+#else
+    lo_float = __internal_half2float(((__half2_raw)a).x);
+    hi_float = __internal_half2float(((__half2_raw)a).y);
+#endif
+    return make_float2(lo_float, hi_float);
+}
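__half2 packs two halves into one 32-bit register, and the helpers just above convert a float2 pair in a single step. A short sketch (the function name is illustrative; host or device under nvcc):

    __host__ __device__ float sum_as_half_pair(const float2 p)
    {
        const __half2 h = __float22half2_rn(p);  // pack both lanes, round-to-nearest-even
        const float2 f = __half22float2(h);      // unpack: f.x = low lane, f.y = high lane
        return f.x + f.y;
    }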
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h)
+{
+    int i;
+    asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h)
+{
+    int i;
+    asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h)
+{
+    int i;
+    asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_rz(const int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_rd(const int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_ru(const int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
+{
+    short int i;
+    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
+{
+    short int i;
+    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
+{
+    short int i;
+    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
+{
+    __half h;
+#if defined __CUDA_ARCH__
+    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+#else
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_rz(const short int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_rd(const short int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_ru(const short int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
+{
+    __half h;
+#if defined __CUDA_ARCH__
+    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
+{
+    __half h;
+#if defined __CUDA_ARCH__
+    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+#else
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
+{
+    long long int i;
+    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
+{
+    long long int i;
+    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
+{
+    long long int i;
+    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ __half htrunc(const __half h)
+{
+    __half r;
+    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hceil(const __half h)
+{
+    __half r;
+    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hfloor(const __half h)
+{
+    __half r;
+    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hrint(const __half h)
+{
+    __half r;
+    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " cvt.rzi.f16.f16 low, low;\n"
+        " cvt.rzi.f16.f16 high, high;\n"
+        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " cvt.rpi.f16.f16 low, low;\n"
+        " cvt.rpi.f16.f16 high, high;\n"
+        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " cvt.rmi.f16.f16 low, low;\n"
+        " cvt.rmi.f16.f16 high, high;\n"
+        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " cvt.rni.f16.f16 low, low;\n"
+        " cvt.rni.f16.f16 high, high;\n"
+        " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
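htrunc/hceil/hfloor/hrint and their h2* counterparts apply the directed roundings, the vector forms to both lanes at once. A device-side sketch, assuming __hsub2 from the arch >= 5.3 arithmetic set:

    // Fractional part of both lanes: frac(v) = v - floor(v)
    __device__ __half2 h2frac(const __half2 v)
    {
        return __hsub2(v, h2floor(v));
    }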
"r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); + return val; +} +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); + return val; +} +__CUDA_FP16_DECL__ __half __low2half(const __half2 a) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); + return ret; +} +__CUDA_FP16_DECL__ int __hisinf(const __half a) +{ + int retval; + if (__HALF_TO_CUS(a) == 0xFC00U) { + retval = -1; + } else if (__HALF_TO_CUS(a) == 0x7C00U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half __high2half(const __half2 a) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); + return ret; +} +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); + return val; +} +__CUDA_FP16_DECL__ __half2 __half2half2(const __half a) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ short int __half_as_short(const __half h) +{ + return static_cast(__HALF_TO_CUS(h)); +} +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ + return __HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ __half __short_as_half(const short int i) +{ + __half h; + __HALF_TO_US(h) = static_cast(i); + return h; +} +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ + __half h; + __HALF_TO_US(h) = i; + return h; +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b) +{ +#if 
+/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF_MACRO(max) +#else + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +#endif +} +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF_MACRO(min) +#else + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +#endif +} + +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF2_MACRO(max) +#else + const float2 fa = __half22float2(a); + const float2 fb = __half22float2(b); + float2 fr; + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr.x) : "f"(fa.x), "f"(fb.x)); + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr.y) : "f"(fa.y), "f"(fb.y)); + const __half2 hr = __float22half2_rn(fr); + return hr; +#endif +} +__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF2_MACRO(min) +#else + const float2 fa = __half22float2(a); + const float2 fb = __half22float2(b); + float2 fr; + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr.x) : "f"(fa.x), "f"(fb.x)); + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr.y) : "f"(fa.y), "f"(fb.y)); + const __half2 hr = __float22half2_rn(fr); + return hr; +#endif +} + + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} /* while(0) */ + +#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; + __SHUFFLE_HALF2_MACRO(shfl.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32) +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + 
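// [Editor's note -- sketch, not part of the vendored header.] The control
// word c built by each shuffle above packs two fields of the PTX shfl
// instruction: the low five bits hold the lane clamp and bits 12..8 hold the
// segment mask, so (warp_size - width) << 8 carves the warp into width-sized
// segments, and the idx/down/bfly variants OR in 0x1f to clamp at the segment
// boundary (shfl.up leaves the clamp at 0 so lanes saturate downward).
// Worked values for a 32-lane warp:
//   width == 32: c = ((32 - 32) << 8) | 0x1f = 0x001f  -> one 32-lane segment
//   width == 16: c = ((32 - 16) << 8) | 0x1f = 0x101f  -> two 16-lane segments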
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32) +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +}
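// [Editor's usage sketch -- not part of the vendored header. Assumes a full,
// converged warp (hence the 0xffffffffU mask) and sm_53+ for __hadd, which is
// declared further down this header.] The __half wrappers above splat the
// scalar into both lanes of a __half2 and shuffle that; they compose into the
// standard warp-level reduction:
__device__ static __half warp_sum_sketch(__half v) {
    for (int offset = 16; offset > 0; offset /= 2) {
        v = __hadd(v, __shfl_down_sync(0xffffffffU, v, offset, 32));
    }
    return v; // lane 0 now holds the sum over all 32 lanes
}
+__CUDA_FP16_DECL__ __half 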
__shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2half(temp2); +} + +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void 
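// [Editor's note -- sketch, not part of the vendored header.] These load and
// store wrappers differ only in the PTX cache hint they emit: .nc (__ldg)
// uses the read-only, non-coherent path for data no thread writes, .ca and
// .cg cache at all levels or at L2 only, .cs marks streaming data for early
// eviction, .lu marks a last use, and .cv / .wt make reads and writes bypass
// potentially stale cached copies. A streaming copy would pair them, e.g.
// (illustrative):
//   const __half v = __ldcs(src + i);  // evict-first read
//   __stcs(dst + i, v);                // evict-first write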
__stcg(__half *const ptr, const __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __COMPARISON_OP_HALF2_MACRO +#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + bool retval; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + 
__BOOL_COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __BOOL_COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq) +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne) +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le) +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge) +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt) +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt) +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ) +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu) +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu) +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu) +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu) +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add) +} +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub) +} +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul) +} +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ + 
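// [Editor's note -- sketch, not part of the vendored header.] The predicate
// pairs defined above differ only on NaN: the plain forms are ordered (any
// NaN operand makes them false) while the 'u' forms are unordered (any NaN
// operand makes them true), so each pair is an exact complement, e.g.
// !__hlt(a, b) == __hgeu(a, b) for every input, which is how NaN-aware
// clamps are usually written. The __hb*2 forms reduce a lane-wise __half2
// compare to one bool by testing the result against 0x3C003C00, i.e. the
// value 1.0 in both lanes.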
__BINARY_OP_HALF2_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.rn) +} +__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.rn) +} +__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) { + __half ha = __low2half(a); + __half hb = __low2half(b); + + const __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + const __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add) +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub) +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul) +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.rn) +} +__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.rn) +} +__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.rn) +} +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) { + __half v; + __half abs; + __half den; + __HALF_TO_US(den) = 0x008FU; + + float rcp; + const float fa = __half2float(a); + const float fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + float fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = static_cast<unsigned short>(static_cast<unsigned>(__HALF_TO_CUS(v)) & 0x00007FFFU); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000U))) { + const float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + 
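// [Editor's reading of __hdiv above -- sketch, not part of the vendored
// header.] rcp.approx.ftz.f32 is accurate to roughly 23 bits, already well
// past half precision, so fv = rcp(b) * a normally suffices; only when the
// magnitude of the result drops below the 0x008F threshold (a tiny,
// near-subnormal half, where those approximate low bits become the answer)
// does the code spend one Newton step:
//   err = __fmaf_rn(-fb, fv, fa);  // residual a - b*fv in a single rounding
//   fv  = __fmaf_rn(rcp, err, fv); // corrected quotient
+/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," 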
__CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +static __device__ __forceinline__ float __float_simpl_sinf(float a); +static __device__ __forceinline__ float __float_simpl_cosf(float a); +__CUDA_FP16_DECL__ __half hsin(const __half a) { + const float sl = __float_simpl_sinf(__half2float(a)); + __half r = __float2half_rn(sl); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " and.b16 t, r, 0x8000U; \n\t" + " abs.f16 r, r; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X32B3U, 0x0800U) + __SPEC_CASE(i, r, 0X5CB0U, 0x9000U) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + const float sl = __float_simpl_sinf(__half2float(a.x)); + const float sh = __float_simpl_sinf(__half2float(a.y)); + __half2 r = __floats2half2_rn(sl, sh); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000U; \n\t" + " abs.f16x2 r, r; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) + __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + const float cl = __float_simpl_cosf(__half2float(a)); + __half r = __float2half_rn(cl); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + const float cl = __float_simpl_cosf(__half2float(a.x)); + const float ch = __float_simpl_cosf(__half2float(a.y)); + __half2 r = __floats2half2_rn(cl, ch); + asm("{\n\t" + " .reg.b32 i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant) +{ + const 
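// [Editor's note -- sketch, not part of the vendored header.] Two idioms
// meet in the reduction that follows. 0.636619772F is 2/pi, and adding
// 12582912.0F (1.5 * 2^23) pushes a*(2/pi) into the float range where the
// unit in the last place is exactly 1, so the addition itself rounds to the
// nearest integer: the quadrant lands in the low mantissa bits (read back
// with __float_as_uint) and j = ar - 12582912.0F recovers it as a float.
// The angle is then reduced with pi/2 split Cody-Waite style into
// 1.5707962512969971e+0 + 7.5497894158615964e-8, so each fma subtracts an
// exactly representable piece. The __SPEC_CASE fixups used by hsin/hcos
// above follow the second idiom: compare the input against one problematic
// bit pattern and, on a match, nudge the result by a signed ulp via fma.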
float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F); + const unsigned q = __float_as_uint(ar); + const float j = __fsub_rn(ar, 12582912.0F); + float t = __fmaf_rn(j, -1.5707962512969971e+000F, a); + t = __fmaf_rn(j, -7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i) +{ + float z; + const float x2 = x*x; + float a8; + float a6; + float a4; + float a2; + float a1; + float a0; + + if ((i & 1U) != 0U) { + // cos + a8 = 2.44331571e-5F; + a6 = -1.38873163e-3F; + a4 = 4.16666457e-2F; + a2 = -5.00000000e-1F; + a1 = x2; + a0 = 1.0F; + } + else { + // sin + a8 = -1.95152959e-4F; + a6 = 8.33216087e-3F; + a4 = -1.66666546e-1F; + a2 = 0.0F; + a1 = x; + a0 = x; + } + + z = __fmaf_rn(a8, x2, a6); + z = __fmaf_rn(z, x2, a4); + z = __fmaf_rn(z, x2, a2); + z = __fmaf_rn(z, a1, a0); + + if ((i & 2U) != 0U) { + z = -z; + } + return z; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C, nZ; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79U, 0x9400U) + __SPEC_CASE(h, r, 0X25CFU, 0x9400U) + __SPEC_CASE(h, r, 0XC13BU, 0x0400U) + __SPEC_CASE(h, r, 0XC1EFU, 0x0200U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) + __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U) + __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U) + __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.ftz.f32 f,f; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; 
\n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C, nZ; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DEU, 0x9800U) + __SPEC_CASE(h, r, 0x9766U, 0x9000U) + __SPEC_CASE(h, r, 0x9972U, 0x1000U) + __SPEC_CASE(h, r, 0xA5C4U, 0x1000U) + __SPEC_CASE(h, r, 0xBF0AU, 0x8100U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) + __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U) + __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U) + __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U) + __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) + __SPEC_CASE(r, r, 0xBF46U, 0x9400U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) + __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.ftz.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160DU, 0x9C00U) + __SPEC_CASE(h, r, 0X3BFEU, 0x8010U) + __SPEC_CASE(h, r, 0X3C0BU, 0x8080U) + __SPEC_CASE(h, r, 0X6051U, 0x1C00U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + 
__SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) + __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U) + __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U) + __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338FU, 0x1000U) + __SPEC_CASE(h, r, 0x33F8U, 0x9000U) + __SPEC_CASE(h, r, 0x57E1U, 0x9800U) + __SPEC_CASE(h, r, 0x719DU, 0x9C00U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) + __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U) + __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U) + __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ bool __hisnan(const __half a) +{ + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +} +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hneg(const __half a) +{ + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __habs(const __half a) +{ + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} + +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __half real_tmp = __hfma(a.x, b.x, c.x); + __half img_tmp = 
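// [Editor's note at this step of __hcmadd -- sketch, not part of the
// vendored header.] "Fast" in the comment above is a rounding statement:
// each component comes from two chained __hfma calls, so the result can
// differ in the last place from a correctly rounded complex multiply-add,
// in exchange for four fused half operations total. Callers treat the
// __half2 as (real, imag), e.g.
//   __half2 acc = __hcmadd(z, w, acc);  // acc += z * w, complex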
__hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_half2(real_tmp, img_tmp); +} + +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +} + +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +} +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/ + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) { + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { + __half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/ + +#undef __PTR + +#undef __CUDA_FP16_DECL__ +#endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +#undef __HALF_TO_US +#undef __HALF_TO_CUS +#undef __HALF2_TO_UI +#undef __HALF2_TO_CUI + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +typedef __half half; +typedef __half2 half2; +// for consistency with __nv_bfloat16 +typedef __half __nv_half; +typedef __half2 __nv_half2; +typedef __half_raw __nv_half_raw; +typedef __half2_raw __nv_half2_raw; +typedef __half nv_half; +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.h b/cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.h new file mode 100644 index 0000000..01981ed --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.h @@ -0,0 +1,4023 @@ +/* +* Copyright 
1993-2021 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions that are +* only supported in device code. +* To use these functions, include the header file \p cuda_fp16.h in your program. +* The following macros are available to help users selectively enable/disable +* various definitions present in the header file: +* - \p CUDA_NO_HALF - If defined, this macro will prevent the definition of +* additional type aliases in the global namespace, helping to avoid potential +* conflicts with symbols defined in the user program. +* - \p __CUDA_NO_HALF_CONVERSIONS__ - If defined, this macro will prevent the +* use of the C++ type conversions (converting constructors and conversion +* operators) that are common for built-in floating-point types, but may be +* undesirable for \p half which is essentially a user-defined type. 
+* - \p __CUDA_NO_HALF_OPERATORS__ and \p __CUDA_NO_HALF2_OPERATORS__ - If +* defined, these macros will prevent the inadvertent use of usual arithmetic +* and comparison operators. This enforces the storage-only type semantics and +* prevents C++ style computations on \p half and \p half2 types. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) +#if defined(__CUDACC__) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__CUDACC__) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ + +/** + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * + * \internal + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. + * \endinternal + */ +struct __half; + +/** + * \brief half2 datatype + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * + * \internal + * \req Vectorified version of half. + * \endinternal + */ +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. 
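* [Editor's check of the __half description above -- sketch, not part of the
* vendored header.] The quoted count of representable numbers in [0.0, 1.0]
* follows directly from the 1+5+10 bit layout: +0, the 1023 subnormals, all
* normals with biased exponent 1..14 (14 * 1024 fraction patterns, every one
* below 1.0), and 1.0 itself:
*   1 + 1023 + 14*1024 + 1 = 15361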
+* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* \details Converts half number \p a to float. +* \param[in] a - float. Is only being read. +* +* \returns float +* - \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* \details Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. 
+* \param[in] a - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with both halves equal to the converted half +* precision number. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with corresponding halves equal to the +* converted input floats. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. 
+* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h); + +#if defined(__CUDACC__) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of float2 to half precision in round-to-nearest +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* - The \p half2 which has corresponding halves equal to the +* converted float2 components. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* - \p a converted to float2. 
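* [Editor's sketch -- not part of the vendored header.] The packing contract
* documented for __float22half2_rn / __half22float2 pairs up as a round trip,
* exact up to the half rounding of each component:
*   __half2 h = __float22half2_rn(f2);  // f2.x -> low 16 bits, f2.y -> high
*   float2 g = __half22float2(h);       // g == f2 once f2 is half-precise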
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rz(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rd(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_ru(const int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rz(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rd(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. 
+* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_ru(const short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-up mode.
+*
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-to-nearest-even
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+*
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-down mode.
+* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. 
+* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even +* mode. 
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+*
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read.
+*
+* \returns half
+* - \p i converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+*
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The truncated integer value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htrunc(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+*
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The smallest integer value not less than \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hceil(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+*
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - half. Is only being read.
+*
+* \returns half
+* - The largest integer value which is less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hfloor(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating-point
+* number.
+*
+* \details Round \p h to the nearest integer value in half-precision floating-point
+* format, with halfway cases rounded to the nearest even integer value.
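+*
+* \par Example
+* A device-side sketch of the four rounding functions on the exactly
+* representable value 2.5 (variable names are illustrative only):
+* \code
+* const __half x = __float2half(2.5f);
+* const __half t = htrunc(x); // 2.0: toward zero
+* const __half c = hceil(x);  // 3.0: toward positive infinity
+* const __half f = hfloor(x); // 2.0: toward negative infinity
+* const __half r = hrint(x);  // 2.0: halfway case rounds to even
+* \endcode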
+* \param[in] h - half. Is only being read. +* +* \returns half +* - The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of smallest integers not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of largest integers which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round each component of \p half2 vector \p h to the nearest integer value in +* half-precision floating-point format, with halfway cases rounded to the +* nearest even integer value. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of rounded integer values. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns \p half2 with both halves equal to the input value. +* +* \details Returns \p half2 number with both halves equal to the input \p a \p half +* number. +* \param[in] a - half. Is only being read. +* +* \returns half2 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __half2half2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Swaps both halves of the \p half2 input. +* +* \details Swaps both halves of the \p half2 input and returns a new \p half2 number +* with swapped halves. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - \p a with its halves being swapped. 
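+*
+* \par Example
+* A device-side sketch of half2 packing, swapping, and lane extraction
+* (variable names are illustrative only):
+* \code
+* const __half2 v  = __floats2half2_rn(1.0f, 2.0f); // low = 1.0, high = 2.0
+* const __half2 sw = __lowhigh2highlow(v);          // low = 2.0, high = 1.0
+* const __half  lo = __low2half(sw);                // 2.0
+* const __half  hi = __high2half(sw);               // 1.0
+* \endcode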
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number.
+*
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of
+* the return value, low 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The low 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+*
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. High 16 bits from input \p a are stored in low 16 bits of
+* the return value, high 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The high 16 bits of \p a and of \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* - The high 16 bits of the input.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half
+* - Returns \p half which contains low 16 bits of the input \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+*
+* \details Checks if the input \p half number \p a is infinite.
+* \param[in] a - half. Is only being read.
+*
+* \returns int
+* - -1 iff \p a is equal to negative infinity,
+* - 1 iff \p a is equal to positive infinity,
+* - 0 otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+*
+* \details Combines two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* +* \returns half2 +* - The half2 with one half equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from \p half2 input. +* +* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from \p half2 input. +* +* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* \details Reinterprets the bits in the half-precision floating-point number \p h +* as a signed short integer. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. +* +* \details Reinterprets the bits in the half-precision floating-point \p h +* as an unsigned short number. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* half-precision floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. 
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, a canonical NaN is returned.
+* - If both inputs are zeros, they are ordered as +0.0 > -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, a canonical NaN is returned.
+* - If both inputs are zeros, they are ordered as +0.0 > -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, a canonical NaN is returned.
+* - If both inputs are zeros, they are ordered as +0.0 > -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise maximum of vectors \p a and \p b
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, a canonical NaN is returned.
+* - If both inputs are zeros, they are ordered as +0.0 > -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
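+*
+* \par Example
+* A device-side sketch of the NaN handling shared by these minimum/maximum
+* functions (variable names are illustrative only; nanf() is from the CUDA
+* math library):
+* \code
+* const __half n   = __float2half(nanf(""));
+* const __half one = __float2half(1.0f);
+* const __half m   = __hmax(n, one); // 1.0: the non-NaN operand is returned
+* \endcode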
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a and \p b
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize 32
+#define __local_warpSize
+#endif
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+#define __WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize);
+#endif
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the thread with ID delta modulo
+* width (i.e. within the same subsection). width must have a value which is a power of 2;
+* results are undefined if width is not a power of 2, or is a number greater than
+* warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
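+*
+* \par Example
+* A sketch of a full-warp sum reduction over half2 lanes; it assumes all 32
+* lanes are active and a device where __hadd2 is available (compute
+* capability 5.3 or higher); names and values are illustrative only:
+* \code
+* __half2 v = __float22half2_rn(make_float2(1.0f, 2.0f)); // per-lane value
+* for (unsigned int offset = 16U; offset > 0U; offset >>= 1) {
+*     // Add the value held by the lane `offset` positions down the warp.
+*     v = __hadd2(v, __shfl_down_sync(0xFFFFFFFFU, v, offset));
+* }
+* // Lane 0 now holds the elementwise sum over the whole warp.
+* \endcode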
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2,
+* or is a number greater than warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding delta to the caller's thread ID.
+* The value of var held by the resulting thread ID is returned: this has the effect
+* of shifting var down the warp by delta threads. If width is less than warpSize then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of width and so the upper delta threads
+* will remain unchanged.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p delta:
+* the value of var held by the resulting thread ID is returned.
If width is less than warpSize then each
+* group of width consecutive threads is able to access elements from earlier groups of threads;
+* however, if a thread attempts to access elements from later groups of threads, its own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half2. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
+*
+* \details Returns the value of var held by the thread whose ID is given by delta.
+* If width is less than warpSize then each subsection of the warp behaves as a separate
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
+* the value returned corresponds to the value of var held by the thread with ID delta modulo
+* width (i.e. within the same subsection). width must have a value which is a power of 2;
+* results are undefined if width is not a power of 2, or is a number greater than
+* warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
+*
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2,
+* or is a number greater than warpSize.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
+*
+* \details Calculates a source thread ID by adding delta to the caller's thread ID.
+* The value of var held by the resulting thread ID is returned: this has the effect
+* of shifting var down the warp by delta threads. If width is less than warpSize then
+* each subsection of the warp behaves as a separate entity with a starting logical
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
+* will not wrap around the value of width and so the upper delta threads
+* will remain unchanged.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - unsigned int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
+*
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p delta:
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads is able to access elements from earlier groups of threads;
+* however, if a thread attempts to access elements from later groups of threads, its own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree
+* reduction and broadcast.
+* \param[in] mask - unsigned int. Is only being read.
+* \param[in] var - half. Is only being read.
+* \param[in] delta - int. Is only being read.
+* \param[in] width - int. Is only being read.
+*
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half.
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize);
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */
+
+#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. 
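+*
+* \par Example
+* A sketch of a streaming copy that combines a read-only cached load with a
+* streaming store (kernel and variable names are illustrative only):
+* \code
+* __global__ void stream_copy(__half2 *dst, const __half2 *src)
+* {
+*     // ld.global.nc: load through the read-only data cache.
+*     const __half2 v = __ldg(src + threadIdx.x);
+*     // st.global.cs: streaming store; the data is not expected to be reused.
+*     __stcs(dst + threadIdx.x, v);
+* }
+* \endcode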
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); +#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. 
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The half2 vector result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
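+*
+* \par Example
+* A device-side sketch contrasting the ordered and unordered forms when one
+* input holds NaNs (variable names are illustrative only; nanf() is from the
+* CUDA math library):
+* \code
+* const __half2 a = __half2half2(__float2half(nanf("")));
+* const __half2 b = __half2half2(__float2half(1.0f));
+* const __half2 ordered   = __hle2(a, b);  // (0.0, 0.0): NaN compares false
+* const __half2 unordered = __hleu2(a, b); // (1.0, 1.0): NaN compares true
+* \endcode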
+* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __heq2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hne2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hle2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hge2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hlt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hgt2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
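+* \par
+* A minimal sketch of the mask form feeding a bitwise lane select (the helper
+* name and operands are hypothetical; device code, compute capability 5.3+):
+* \code
+* __device__ unsigned select_bits(__half2 x, __half2 y,
+*                                 unsigned p, unsigned q)
+* {
+*     unsigned m = __hequ2_mask(x, y); // 0xFFFF per lane: x == y, or NaN input
+*     return (m & p) | (~m & q);       // take p where the lane matched, else q
+* }
+* \endcode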
+* +* \returns unsigned int +* - The vector mask result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hequ2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hneu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hleu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hgeu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hltu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p unsigned bits are set to 0xFFFF for true, or 0x0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. +* +* \returns unsigned int +* - The vector mask result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned __hgtu2_mask(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Determine whether \p half2 argument is a NaN. +* +* \details Determine whether each half of input \p half2 number \p a is a NaN. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with the corresponding \p half results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. 
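+* \par
+* A minimal sketch of why the _rn form exists (the operands are hypothetical):
+* \code
+* // The compiler may contract the first expression into a single __hfma2
+* // (one rounding); the _rn forms keep the two roundings separate.
+* __half2 contractible  = __hadd2(__hmul2(a, b), c);
+* __half2 two_roundings = __hadd2_rn(__hmul2_rn(a, b), c);
+* \endcode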
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub +* into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of +* mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector division in round-to-nearest-even mode. +* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with the absolute value of both halves. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of \p a and \p b, with respect to saturation. 
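+* \par
+* A minimal sketch of the saturating behavior (values chosen for
+* illustration):
+* \code
+* __half2 x = __floats2half2_rn(0.75f, -0.25f);
+* __half2 y = __floats2half2_rn(0.50f,  0.10f);
+* __half2 s = __hadd2_sat(x, y); // {1.0, 0.0}: 1.25 and -0.15 are clamped
+* \endcode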
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. 
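+* \par
+* A minimal sketch (operands hypothetical): negation composes with the fused
+* multiply-add above into a single-rounding multiply-subtract:
+* \code
+* __half2 msub = __hfma2(a, b, __hneg2(c)); // a * b - c, rounded once
+* \endcode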
+* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with both halves negated. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The absolute value of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. Prevents floating-point contractions of mul+sub into fma. 
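+* \par
+* A minimal sketch (operands hypothetical):
+* \code
+* __half d1 = __hsub(__hmul(a, b), c);       // may contract into one __hfma
+* __half d2 = __hsub_rn(__hmul_rn(a, b), c); // never contracted; two roundings
+* \endcode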
+* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b, with respect to saturation. 
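+* \par
+* A minimal sketch of the clamping (values chosen for illustration):
+* \code
+* __half two = __float2half(2.0f);
+* __half p = __hmul_sat(two, two);         // 4.0 is clamped to 1.0
+* __half q = __hmul_sat(two, __hneg(two)); // -4.0 is clamped to +0.0
+* \endcode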
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-96
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on \p a, \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on \p a, \p b, and \p c,
+* with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Negates input \p half number and returns the result.
+*
+* \details Negates input \p half number and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-100
+* \endinternal
+* \param[in] a - half. Is only being read.
+*
+* \returns half
+* - The negation of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hneg(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison and returns boolean true
+* iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns bool
+* - true if both \p half results of if-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
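+* \par
+* A minimal sketch: the boolean reduction is handy as an "all lanes" test
+* before a branch (the variable names are hypothetical):
+* \code
+* if (__hbgt2(grad, threshold)) {
+*     // reached only when both half lanes of grad exceed threshold
+* }
+* \endcode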
+* +* \returns bool +* - true if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns bool +* - true if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. 
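+* \par
+* A minimal sketch building a clamp from the ordered comparisons (the helper
+* is hypothetical; a NaN input falls through unchanged):
+* \code
+* __device__ __half clamp01(const __half v)
+* {
+*     const __half lo = __float2half(0.0f);
+*     const __half hi = __float2half(1.0f);
+*     return __hle(v, lo) ? lo : (__hge(v, hi) ? hi : v);
+* }
+* \endcode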
+* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. 
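+* \par
+* A minimal sketch: an ordered/unordered pair can only disagree when a NaN is
+* present, which gives a branch-free NaN test (operands hypothetical):
+* \code
+* bool has_nan = __hleu(a, b) != __hle(a, b); // true iff a or b is NaN
+* \endcode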
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-equal comparison.
+*
+* \details Performs \p half greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered greater-equal comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered less-than comparison.
+*
+* \details Performs \p half less-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered less-than comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-than comparison.
+*
+* \details Performs \p half greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns bool
+* - The boolean result of unordered greater-than comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Determine whether \p half argument is a NaN.
+*
+* \details Determine whether \p half value \p a is a NaN.
+* \param[in] a - half. Is only being read.
+*
+* \returns bool
+* - true iff the argument is NaN.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hisnan(const __half a);
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half maximum of two input values, NaNs pass through.
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, the canonical NaN is returned.
+* - If both inputs are 0.0, +0.0 is treated as greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values, NaNs pass through.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, the canonical NaN is returned.
+* - If both inputs are 0.0, +0.0 is treated as greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
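+* \par
+* A minimal sketch contrasting NaN propagation with plain selection (values
+* chosen for illustration; compute capability 8.0+ for __hmin_nan):
+* \code
+* __half q   = __float2half(nanf(""));
+* __half one = __float2half(1.0f);
+* __half m1 = __hmin_nan(q, one);      // canonical NaN: NaN passes through
+* __half m2 = __hlt(q, one) ? q : one; // 1.0: the comparison is false on NaN
+* \endcode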
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* A negative result is then clamped to 0.
+* A NaN result is converted to the canonical NaN.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on \p a, \p b, and \p c
+* with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, the canonical NaN is returned.
+* - If both inputs are 0.0, +0.0 is treated as greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise maximum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, the canonical NaN is returned.
+* - If both inputs are 0.0, +0.0 is treated as greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a and \p b, with NaNs passed through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode with relu saturation.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* A negative result is then clamped to 0.
+* A NaN result is converted to the canonical NaN.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
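+* \par
+* A minimal sketch: a fused bias-add-plus-ReLU epilogue (the names are
+* hypothetical; compute capability 8.0+):
+* \code
+* __half2 y = __hfma2_relu(w, x, bias); // w * x + bias, clamped below at +0.0
+* \endcode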
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) */ +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as +* complex numbers in \p half precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input \p a in round-to-nearest +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. 
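+* \par
+* A minimal sketch (names hypothetical): a half-precision decibel conversion:
+* \code
+* __half db = __hmul(__float2half(20.0f), hlog10(amplitude)); // 20*log10(x)
+* \endcode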
+* +* \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half binary exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest +* mode. +* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. 
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise reciprocal square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise reciprocal on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise natural logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise binary logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise decimal logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector exponential function in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read.
+*
+* \returns half2
+* - The elementwise binary exponential function on vector \p a.
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the +* two __half elements; the entire __half2 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 6.x and higher. +* +* \param[in] address - half2*. An address in global or shared memory. +* \param[in] val - half2. The value to be added. +* +* \returns half2 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) + +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher. +* +* \param[in] address - half*. An address in global or shared memory. +* \param[in] val - half. The value to be added. +* +* \returns half +* - The old value read from \p address. 
+* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)*/ + +#endif /* defined(__CUDACC__) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ + +#endif /* defined(__cplusplus) */ + +/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */ +#include "cuda_fp16.hpp" +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.hpp b/cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.hpp new file mode 100644 index 0000000..22fb0c0 --- /dev/null +++ b/cupy/_core/include/cupy/_cuda/cuda-12/cuda_fp16.hpp @@ -0,0 +1,2738 @@ +/* +* Copyright 1993-2022 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_FP16_HPP__) +#define __CUDA_FP16_HPP__ + +#if !defined(__CUDA_FP16_H__) +#error "Do not include this file directly. Instead, include cuda_fp16.h." 
+#endif
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#endif
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+#include <nv/target>
+#endif  /* !defined(__CUDACC_RTC__) */
+
+
+#if !defined(IF_DEVICE_OR_CUDACC)
+#if defined(__CUDACC__)
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c)
+#else
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f)
+#endif
+#endif
+/* C++11 header for std::move.
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Macros for half & half2 binary arithmetic */
+#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
+   return val; \
+} /* while(0) */
+
+/**
+* Types which allow static initialization of "half" and "half2" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "half", and not a conversion from short->half.
+* Such a representation will be deprecated in a future version of CUDA.
+* (Note these are visible to non-nvcc compilers, including C-only compilation)
+*/
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+typedef struct __CUDA_ALIGN__(4) {
+    unsigned short x;
+    unsigned short y;
+} __half2_raw;
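+/* [Editor's note: illustrative sketch, not part of the NVIDIA header.]
+* The raw types above hold the IEEE 754 binary16 bit pattern directly, so a
+* half constant can be produced without any float conversion, e.g.:
+*
+*     __half_raw one;         // 1.0 in binary16:
+*     one.x = 0x3C00U;        // sign 0, exponent 01111, mantissa all zero
+*     __half h = one;         // via the __half(const __half_raw&) constructor
+*
+* (The converting constructor is defined further below in this file.)
+*/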
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half() = default;
+#else
+    __CUDA_HOSTDEVICE__ __half() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+    /* Convert to/from __half_raw */
+    __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
+    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
+
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+
+    /* Construct from float/double */
+    __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
+    __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
+
+    __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; }
+
+/* Member functions only available to nvcc compilation so far */
+#if defined(__CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+    __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; }
+    __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; }
+
__CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; } + + /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ + __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = __short2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* defined(__CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16 operations only supported on arch >= 5.3 */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */ +__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; } +__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; } +__device__ __forceinline__ __half operator++(__half &h, const int ignored) +{ + // ignored on 
purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half ret = h;
+    __half_raw one;
+    one.x = 0x3C00U;
+    h += one;
+    return ret;
+}
+__device__ __forceinline__ __half operator--(__half &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half ret = h;
+    __half_raw one;
+    one.x = 0x3C00U;
+    h -= one;
+    return ret;
+}
+
+/* Unary plus and inverse operators */
+__device__ __forceinline__ __half operator+(const __half &h) { return h; }
+__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
+
+/* Some basic comparison operations to make it look like a builtin */
+__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); }
+__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
+__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
+#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */
+#endif /* defined(__CUDACC__) */
+
+/* __half2 is visible to non-nvcc host compilers */
+struct __CUDA_ALIGN__(4) __half2 {
+    __half x;
+    __half y;
+
+    // All construct/copy/assign/move
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half2() = default;
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; }
+#else
+    __CUDA_HOSTDEVICE__ __half2() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+    __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; }
+
+    /* Convert to/from __half2_raw */
+    __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; }
+    __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; }
+};
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
+
+__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
+__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
+__device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); }
+__device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); }
+
+__device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
+
+__device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
+__device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
+__device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half2 ret = h;
+    __half2_raw one;
+    one.x = 0x3C00U;
+    one.y = 0x3C00U;
+    h = __hadd2(h, one);
+    return ret;
+}
+__device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half2 ret = h;
+    __half2_raw one;
+    one.x = 0x3C00U;
+    one.y = 0x3C00U;
+    h = __hsub2(h, one);
+    return ret;
+}
+
+__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
+__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
+
+__device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
+__device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
+__device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
+
+#endif /* (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA)) && !defined(__CUDA_NO_HALF2_OPERATORS__) */
+#endif /* defined(__CUDACC__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
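+/* [Editor's note: illustrative sketch, not part of the NVIDIA header.]
+* With the operator overloads above, __half/__half2 device code can be
+* written in ordinary arithmetic style on CC >= 5.3 devices, e.g. a
+* hypothetical SAXPY-like kernel:
+*
+*     __global__ void haxpy(int n, __half a, const __half *x, __half *y) {
+*         int i = blockIdx.x * blockDim.x + threadIdx.x;
+*         if (i < n) y[i] = a * x[i] + y[i];   // lowers to __hmul/__hadd
+*     }
+*
+* Defining __CUDA_NO_HALF_OPERATORS__ or __CUDA_NO_HALF2_OPERATORS__ before
+* including cuda_fp16.h suppresses these overloads.
+*/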
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
+{
+    unsigned int x;
+    unsigned int u;
+    unsigned int result;
+#if defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+    u = (x & 0x7fffffffU);
+    sign = ((x >> 16U) & 0x8000U);
+    // NaN/+Inf/-Inf
+    if (u >= 0x7f800000U) {
+        remainder = 0U;
+        result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
+    } else if (u > 0x477fefffU) { // Overflows
+        remainder = 0x80000000U;
+        result = (sign | 0x7bffU);
+    } else if (u >= 0x38800000U) { // Normal numbers
+        remainder = u << 19U;
+        u -= 0x38000000U;
+        result = (sign | (u >> 13U));
+    } else if (u < 0x33000001U) { // +0/-0
+        remainder = u;
+        result = sign;
+    } else { // Denormal numbers
+        const unsigned int exponent = u >> 23U;
+        const unsigned int shift = 0x7eU - exponent;
+        unsigned int mantissa = (u & 0x7fffffU);
+        mantissa |= 0x800000U;
+        remainder = mantissa << (32U - shift);
+        result = (sign | (mantissa >> shift));
+        result &= 0x0000FFFFU;
+    }
+    return static_cast<unsigned short>(result);
+}
+#endif  /* #if !defined(__CUDACC_RTC__) */
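+/* [Editor's note: worked example, not part of the NVIDIA header.]
+* Tracing __internal_float2half on f = 1.5f (bits x = 0x3FC00000):
+*     u = 0x3FC00000              -> normal range (0x38800000 <= u <= 0x477FEFFF)
+*     remainder = u << 19 = 0     -> no rounding needed
+*     u -= 0x38000000             -> rebias exponent: 127 (fp32) -> 15 (fp16)
+*     result = sign | (u >> 13) = 0x3E00
+* i.e. binary16 0x3E00 == 1.5 (sign 0, exponent 01111, mantissa 1000000000).
+* The callers below use sign/remainder to apply the requested rounding mode
+* on top of this truncated result.
+*/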
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
+{
+IF_DEVICE_OR_CUDACC(
+    __half val;
+    asm("{  cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
+    return val;
+,
+    __half result;
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    unsigned long long int absa;
+    unsigned long long int ua;
+    (void)memcpy(&ua, &a, sizeof(a));
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {   // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {   // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        double aShiftRound = a + shifter;
+
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing intermediate memcopy and harmless bitwise operation
+        unsigned long long int aShiftRoundBits;
+        (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+,
+    __half result;
+    /*
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    */
+    unsigned long long int absa;
+    unsigned long long int ua;
+    (void)std::memcpy(&ua, &a, sizeof(a));
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        /*
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        */
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        /*
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        */
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {
+            /*
+            // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            */
+            shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {
+            /*
+            // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            */
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        double aShiftRound = a + shifter;
+
+        /*
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing intermediate memcopy and harmless bitwise operation
+        */
+        unsigned long long int aShiftRoundBits;
+        (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+)
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    val = r;
+)
+    return val;
+}
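+/* [Editor's note: worked example, not part of the NVIDIA header.]
+* The _rn/_rz/_rd/_ru suffixes select the rounding mode when a float falls
+* between two representable halves. For f = 1.00048828125f (1 + 2^-11,
+* exactly halfway between half 0x3C00 == 1.0 and half 0x3C01 ~= 1.000977):
+*
+*     __float2half_rn(f) -> 0x3C00   // ties round to even mantissa
+*     __float2half_rz(f) -> 0x3C00   // toward zero
+*     __float2half_rd(f) -> 0x3C00   // toward -inf
+*     __float2half_ru(f) -> 0x3C01   // toward +inf
+*/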
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{
+    __half val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+,
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+,
+    val = __half2(__float2half_rn(a), __float2half_rn(a));
+)
+    return val;
+}
+
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+__CUDA_FP16_DECL__ __half2 __internal_device_float2_to_half2_rn(const float a, const float b) {
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
+        : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+,
+    asm("{.reg .f16 low,high;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  cvt.rn.f16.f32 high, %2;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+)
+    return val;
+}
+
+#endif /* defined(__CUDACC__) || defined(_NVHPC_CUDA) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{
+    __half2 val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    val = __internal_device_float2_to_half2_rn(a,b);
+,
+    val = __half2(__float2half_rn(a), __float2half_rn(b));
+)
+    return val;
+}
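+/* [Editor's note: illustrative sketch, not part of the NVIDIA header.]
+* __floats2half2_rn above is the usual way to pack two floats into one
+* __half2; on SM 8.0+ the device path lowers to a single
+* cvt.rn.f16x2.f32 instruction, and the host path falls back to two scalar
+* __float2half_rn conversions:
+*
+*     __half2 h2 = __floats2half2_rn(0.5f, 2.0f);  // h2.x = 0.5, h2.y = 2.0
+*/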
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline float __internal_half2float(const unsigned short h)
+{
+    unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
+    unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
+    unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
+    float f;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U;
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+    return f;
+}
+#endif /* !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
+{
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
+,
+    val = __internal_half2float(static_cast<__half_raw>(a).x);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
+{
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+,
+    val = __internal_half2float(static_cast<__half2_raw>(a).x);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
+{
+    float val;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+,
+    val = __internal_half2float(static_cast<__half2_raw>(a).y);
+)
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
+{
+    short int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<short int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
+{
+    unsigned short int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned short int>(f);
+    }
+)
+    return i;
+}
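+/* [Editor's note: illustrative sketch, not part of the NVIDIA header.]
+* The _rz integer conversions here saturate rather than wrap, matching the
+* device-side cvt.rzi behavior, e.g.:
+*
+*     __half h = __float2half(65504.0f);      // max finite half
+*     __half2short_rz(h)        -> 32767      // clamped to SHRT_MAX
+*     __half2short_rz(__hneg(h)) -> -32768    // clamped to SHRT_MIN
+*
+* NaN inputs map to 0 for the 16/32-bit variants; the 64-bit variants below
+* return the 0x8000000000000000 pattern instead.
+*/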
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
+{
+    int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
+{
+    unsigned int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
+{
+    long long int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = min_val;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<long long int>(f);
+    }
+)
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
+{
+    unsigned long long int i;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+,
+    const float f = __half2float(h);
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0x8000000000000000ULL;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned long long int>(f);
+    }
+)
+    return i;
+}
+
+/* Intrinsic functions only available to nvcc compilers */
+#if defined(__CUDACC__)
+
+/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
+__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y)
+{
+    __half2 t; t.x = x; t.y = y; return t;
+}
+#undef __VECTOR_FUNCTIONS_DECL__
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
+{
+    const __half2 val = __floats2half2_rn(a.x, a.y);
+    return val;
+}
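+/* [Editor's note: illustrative sketch, not part of the NVIDIA header.]
+* A common pattern is to widen a __half2 to float2 (with __half22float2,
+* defined just below), do the math in fp32, and narrow the result back,
+* trading a little throughput for accuracy; the helper name here is
+* hypothetical:
+*
+*     __device__ __half2 rsqrt_accurate(__half2 v) {
+*         float2 f = __half22float2(v);
+*         f.x = rsqrtf(f.x);               // fp32 math
+*         f.y = rsqrtf(f.y);
+*         return __float22half2_rn(f);
+*     }
+*/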
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
+{
+    float hi_float;
+    float lo_float;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));
+
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));
+,
+    lo_float = __internal_half2float(((__half2_raw)a).x);
+    hi_float = __internal_half2float(((__half2_raw)a).y);
+)
+    return make_float2(lo_float, hi_float);
+}
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h)
+{
+    int i;
+    asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h)
+{
+    int i;
+    asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h)
+{
+    int i;
+    asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+)
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_rz(const int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_rd(const int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_ru(const int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
+{
+    short int i;
+    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
+{
+    short int i;
+    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
+{
+    short int i;
+    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+)
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_rz(const short int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_rd(const short int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_ru(const short int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+)
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+,
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+)
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+)
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
+{
+    long long int i;
+    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
+{
+    long long int i;
+    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
+{
+    long long int i;
+    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
+{
+    __half h;
+NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+,
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float f = static_cast<float>(i);
+    h = __float2half_rn(f);
+)
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ __half htrunc(const __half h)
+{
+    __half r;
+    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hceil(const __half h)
+{
+    __half r;
+    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hfloor(const __half h)
+{
+    __half r;
+    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hrint(const __half h)
+{
+    __half r;
+    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
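+/* [Editor's note: worked example, not part of the NVIDIA header.]
+* The four round-to-integral functions above differ only in the PTX
+* rounding modifier (rzi/rpi/rmi/rni); for h = 1.5:
+*
+*     htrunc(1.5) -> 1.0    // toward zero
+*     hceil(1.5)  -> 2.0    // toward +inf
+*     hfloor(1.5) -> 1.0    // toward -inf
+*     hrint(1.5)  -> 2.0    // ties to nearest even (hrint(2.5) is also 2.0)
+*/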
"r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); + return val; +} +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); + return val; +} +__CUDA_FP16_DECL__ __half __low2half(const __half2 a) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); + return ret; +} +__CUDA_FP16_DECL__ int __hisinf(const __half a) +{ + int retval; + if (__HALF_TO_CUS(a) == 0xFC00U) { + retval = -1; + } else if (__HALF_TO_CUS(a) == 0x7C00U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half __high2half(const __half2 a) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); + return ret; +} +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); + return val; +} +__CUDA_FP16_DECL__ __half2 __half2half2(const __half a) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ short int __half_as_short(const __half h) +{ + return static_cast(__HALF_TO_CUS(h)); +} +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ + return __HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ __half __short_as_half(const short int i) +{ + __half h; + __HALF_TO_US(h) = static_cast(i); + return h; +} +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ + __half h; + __HALF_TO_US(h) = i; + return h; +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b) +{ +#if 
+__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF_MACRO(max)
+#else
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    float fr;
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb));
+    const __half hr = __float2half(fr);
+    return hr;
+#endif
+}
+__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF_MACRO(min)
+#else
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    float fr;
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb));
+    const __half hr = __float2half(fr);
+    return hr;
+#endif
+}
+
+/******************************************************************************
+* __half2 arithmetic *
+******************************************************************************/
+__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF2_MACRO(max)
+#else
+    const float2 fa = __half22float2(a);
+    const float2 fb = __half22float2(b);
+    float2 fr;
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
+    const __half2 hr = __float22half2_rn(fr);
+    return hr;
+#endif
+}
+__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF2_MACRO(min)
+#else
+    const float2 fa = __half22float2(a);
+    const float2 fb = __half22float2(b);
+    float2 fr;
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
+    const __half2 hr = __float22half2_rn(fr);
+    return hr;
+#endif
+}
+
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA)
+/******************************************************************************
+* __half, __half2 warp shuffle *
+******************************************************************************/
+#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
+    __half2 r; \
+    asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
+    return r; \
+} /* while(0) */
+
+#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
+    __half2 r; \
+    asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
+        :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+    return r; \
+} /* while(0) */
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+
+__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_HALF2_MACRO(shfl.idx.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
+    __SHUFFLE_HALF2_MACRO(shfl.up.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_HALF2_MACRO(shfl.down.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */
+
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32)
+}
+
+#undef __SHUFFLE_HALF2_MACRO
+#undef __SHUFFLE_SYNC_HALF2_MACRO
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700)
+
+__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl(temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_up(temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_down(temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_xor(temp1, delta, width);
+    return __low2half(temp2);
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 700) */
+
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_sync(mask, temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
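+/* The scalar __half shuffles here are built on the __half2 path: the value
+   is duplicated into both 16-bit lanes with __halves2half2, the packed pair
+   is shuffled as one 32-bit register, and __low2half extracts the result.
+   A sketch of a warp-level sum using this API (hypothetical kernel, not part
+   of this header; assumes full-warp participation and sm_53+ for __hadd):
+
+   __global__ void warp_sum(const __half* in, __half* out) {
+       __half v = in[threadIdx.x];
+       for (int offset = 16; offset > 0; offset >>= 1) {
+           v = __hadd(v, __shfl_down_sync(0xffffffffu, v, offset, 32));
+       }
+       if (threadIdx.x == 0) { *out = v; }
+   }
+*/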
const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) || defined(_NVHPC_CUDA) */ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void 
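+/* These load/store intrinsics expose PTX cache operators: .nc goes through
+   the read-only (non-coherent) data cache, .ca caches at all levels (the
+   default), .cg caches at L2 only, .cs marks data as streaming (evict
+   first), .lu is "last use", and .cv / .wt force reads and writes past
+   stale L2 lines. A sketch of a streaming copy that avoids polluting the
+   cache with data touched exactly once (hypothetical, not part of this
+   header):
+
+   __global__ void copy_once(const __half2* in, __half2* out, size_t n) {
+       size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+       if (i < n) { __stcs(out + i, __ldcs(in + i)); }
+   }
+*/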
__stcg(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /* defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) || defined(_NVHPC_CUDA)) */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half2 comparison with mask output * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO_MASK(name) /* do */ {\ + unsigned val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".u32.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_FP16_DECL__ 
unsigned __heq2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.eq) +} +__CUDA_FP16_DECL__ unsigned __hne2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.ne) +} +__CUDA_FP16_DECL__ unsigned __hle2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.le) +} +__CUDA_FP16_DECL__ unsigned __hge2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.ge) +} +__CUDA_FP16_DECL__ unsigned __hlt2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.lt) +} +__CUDA_FP16_DECL__ unsigned __hgt2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.gt) +} +__CUDA_FP16_DECL__ unsigned __hequ2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.equ) +} +__CUDA_FP16_DECL__ unsigned __hneu2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.neu) +} +__CUDA_FP16_DECL__ unsigned __hleu2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.leu) +} +__CUDA_FP16_DECL__ unsigned __hgeu2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.geu) +} +__CUDA_FP16_DECL__ unsigned __hltu2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.ltu) +} +__CUDA_FP16_DECL__ unsigned __hgtu2_mask(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO_MASK(set.gtu) +} +#undef __COMPARISON_OP_HALF2_MACRO_MASK +#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + bool retval; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __BOOL_COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define 
__COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq) +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne) +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le) +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge) +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt) +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt) +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ) +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu) +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu) +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu) +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu) +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add) +} +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub) +} +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul) +} +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.rn) +} +__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.rn) +} +__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) { + __half ha = __low2half(a); + __half hb = __low2half(b); + + const __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + const __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ 
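+/* Suffix conventions in the arithmetic entry points above and below: "_sat"
+   clamps the rounded result into [0.0, 1.0], and "_rn" is the same
+   round-to-nearest-even operation exposed under a name the compiler will
+   not contract into an fma (the unsuffixed add/sub/mul may be fused).
+   For example, with a = 0.75 and b = 0.5 in half precision:
+       __hadd(a, b)     -> 1.25
+       __hadd_sat(a, b) -> 1.0   (clamped)
+   while __hfma(a, b, c) computes a*b+c with a single final rounding.
+   Likewise, the comparisons with a trailing "u" are unordered: they return
+   true when either operand is NaN, where the plain forms return false. */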
__half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add) +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub) +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul) +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.rn) +} +__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.rn) +} +__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.rn) +} +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) { + __half v; + __half abs; + __half den; + __HALF_TO_US(den) = 0x008FU; + + float rcp; + const float fa = __half2float(a); + const float fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + float fv = rcp * fa; + + v = __float2half(fv); + abs = __habs(v); + if (__hlt(abs, den) && __hlt(__float2half(0.0f), abs)) { + const float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +static __device__ __forceinline__ float __float_simpl_sinf(float a); +static __device__ __forceinline__ float 
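+/* __hdiv above computes the quotient in float: rcp.approx gives a fast
+   reciprocal of b, q = rcp * a is the first estimate, and only when the
+   rounded half result lands below the 0x008F subnormal threshold (about
+   8.5e-6, where the approximation alone may be off) is one residual step
+   applied: err = fma(-b, q, a); q = fma(rcp, err, q). The same refinement
+   written as a plain float helper (sketch; rcp is assumed to be an
+   approximate reciprocal of b):
+
+   __device__ float refine_div(float a, float b, float rcp) {
+       float q = rcp * a;              // initial quotient estimate
+       float err = fmaf(-b, q, a);     // residual a - b*q
+       return fmaf(rcp, err, q);       // one Newton-style correction
+   }
+*/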
__float_simpl_cosf(float a); +__CUDA_FP16_DECL__ __half hsin(const __half a) { + const float sl = __float_simpl_sinf(__half2float(a)); + __half r = __float2half_rn(sl); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " and.b16 t, r, 0x8000U; \n\t" + " abs.f16 r, r; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X32B3U, 0x0800U) + __SPEC_CASE(i, r, 0X5CB0U, 0x9000U) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + const float sl = __float_simpl_sinf(__half2float(a.x)); + const float sh = __float_simpl_sinf(__half2float(a.y)); + __half2 r = __floats2half2_rn(sl, sh); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000U; \n\t" + " abs.f16x2 r, r; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) + __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + const float cl = __float_simpl_cosf(__half2float(a)); + __half r = __float2half_rn(cl); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + const float cl = __float_simpl_cosf(__half2float(a.x)); + const float ch = __float_simpl_cosf(__half2float(a.y)); + __half2 r = __floats2half2_rn(cl, ch); + asm("{\n\t" + " .reg.b32 i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant) +{ + const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F); + const unsigned q = __float_as_uint(ar); + const float j = __fsub_rn(ar, 12582912.0F); + float t = __fmaf_rn(j, -1.5707962512969971e+000F, a); + t = __fmaf_rn(j, -7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i) +{ + float z; + const float x2 = x*x; + float a8; + float a6; + float a4; + float a2; + float a1; + float a0; + + if ((i & 1U) != 0U) { + // cos + a8 = 2.44331571e-5F; + a6 = -1.38873163e-3F; + a4 = 4.16666457e-2F; + a2 = -5.00000000e-1F; + a1 = x2; + a0 = 1.0F; + } + else { + // sin + a8 = -1.95152959e-4F; + a6 = 8.33216087e-3F; + a4 = -1.66666546e-1F; + a2 = 0.0F; + a1 = x; + a0 = x; + } + + z = __fmaf_rn(a8, x2, a6); + z = __fmaf_rn(z, x2, a4); + z = __fmaf_rn(z, x2, a2); + z = __fmaf_rn(z, a1, a0); + + if ((i & 2U) != 0U) { + z = -z; + } + return z; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + 
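+    /* (A note on the sin/cos path above: __internal_trig_reduction_kernel
+       uses a float-format trick. fma(a, 2/pi, 12582912.0f), where 12582912
+       is 1.5 * 2^23, forces round(a * 2/pi) into the low mantissa bits, so
+       the quadrant is read straight from the integer bits of the result,
+       and j = ar - 12582912.0f recovers round(a * 2/pi) exactly; two fmas
+       against a two-piece -pi/2 then subtract j quarter turns with extra
+       precision. For a = pi as a float: ar = 12582914.0f, quadrant bits = 2,
+       and t is the tiny remainder a - pi.) */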
asm("{.reg.b32 f, C, nZ; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79U, 0x9400U) + __SPEC_CASE(h, r, 0X25CFU, 0x9400U) + __SPEC_CASE(h, r, 0XC13BU, 0x0400U) + __SPEC_CASE(h, r, 0XC1EFU, 0x0200U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) + __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U) + __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U) + __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.ftz.f32 f,f; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C, nZ; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DEU, 0x9800U) + __SPEC_CASE(h, r, 0x9766U, 0x9000U) + __SPEC_CASE(h, r, 0x9972U, 0x1000U) + __SPEC_CASE(h, r, 0xA5C4U, 0x1000U) + __SPEC_CASE(h, r, 0xBF0AU, 0x8100U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) + __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U) + __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U) + __SPEC_CASE2(h, r, 0xA5C4A5C4U, 
0x10001000U) + __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) + __SPEC_CASE(r, r, 0xBF46U, 0x9400U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) + __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.ftz.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160DU, 0x9C00U) + __SPEC_CASE(h, r, 0X3BFEU, 0x8010U) + __SPEC_CASE(h, r, 0X3C0BU, 0x8080U) + __SPEC_CASE(h, r, 0X6051U, 0x1C00U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) + __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U) + __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U) + __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338FU, 0x1000U) + __SPEC_CASE(h, r, 0x33F8U, 0x9000U) + __SPEC_CASE(h, r, 0x57E1U, 0x9800U) + __SPEC_CASE(h, r, 0x719DU, 0x9C00U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) + __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U) + __SPEC_CASE2(h, r, 
0x57E157E1U, 0x98009800U) + __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ bool __hisnan(const __half a) +{ + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +} +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hneg(const __half a) +{ + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __habs(const __half a) +{ + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} + +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __half real_tmp = __hfma(a.x, b.x, c.x); + __half img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_half2(real_tmp, img_tmp); +} + +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) || defined(_NVHPC_CUDA) */ + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +} + +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +} +#endif /*defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/ + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || 
defined(__CUDACC_RTC__)*/ + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) { + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +} + +#endif /*defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)*/ + +#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) + +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { + __half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} + +#endif /*defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)*/ + +#undef __PTR + +#undef __CUDA_FP16_DECL__ +#endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +#undef __HALF_TO_US +#undef __HALF_TO_CUS +#undef __HALF2_TO_UI +#undef __HALF2_TO_CUI + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +typedef __half half; +typedef __half2 half2; +// for consistency with __nv_bfloat16 +typedef __half __nv_half; +typedef __half2 __nv_half2; +typedef __half_raw __nv_half_raw; +typedef __half2_raw __nv_half2_raw; +typedef __half nv_half; +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/cupy/_core/include/cupy/atomics.cuh b/cupy/_core/include/cupy/atomics.cuh new file mode 100644 index 0000000..772c1d7 --- /dev/null +++ b/cupy/_core/include/cupy/atomics.cuh @@ -0,0 +1,114 @@ +#pragma once + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) + +__device__ double atomicAdd(double *address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull; + unsigned long long int assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +#endif // #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) + +__device__ float16 atomicAdd(float16* address, float16 val) { + unsigned int *aligned = (unsigned int*)((size_t)address - ((size_t)address & 2)); + unsigned int old = *aligned; + unsigned int assumed; + unsigned short old_as_us; + do { + assumed = old; + old_as_us = (unsigned short)((size_t)address & 2 ? old >> 16 : old & 0xffff); +#if __CUDACC_VER_MAJOR__ >= 9 + half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us)) + float(val)); + unsigned short sum_as_us = __half_as_ushort(sum); +#else + unsigned short sum_as_us = __float2half_rn(__half2float(old_as_us) + float(val)); +#endif + unsigned int sum_as_ui = (size_t)address & 2 ? 
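+/* (This expression splices the updated 16 bits back into the 32-bit word
+   that contains them: there is no universal 16-bit atomic, so the function
+   aligns the address down to its 4-byte word and retries a 32-bit atomicCAS
+   until no other thread has raced. The pattern works for any 16-bit
+   read-modify-write; a self-contained sketch with an increment as the
+   update (hypothetical helper, not part of this file):
+
+   __device__ unsigned short atomic_inc16(unsigned short* p) {
+       unsigned int* word = (unsigned int*)((size_t)p & ~(size_t)2);
+       bool hi = ((size_t)p & 2) != 0;
+       unsigned int old = *word, assumed;
+       do {
+           assumed = old;
+           unsigned short cur = (unsigned short)(hi ? assumed >> 16 : assumed & 0xffffu);
+           unsigned short upd = (unsigned short)(cur + 1u);
+           unsigned int merged = hi ? (assumed & 0x0000ffffu) | ((unsigned int)upd << 16)
+                                    : (assumed & 0xffff0000u) | upd;
+           old = atomicCAS(word, assumed, merged);
+       } while (assumed != old);
+       return (unsigned short)(hi ? old >> 16 : old & 0xffffu);
+   }) */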
(sum_as_us << 16) | (old & 0xffff)
+        : (old & 0xffff0000) | sum_as_us;
+    old = atomicCAS(aligned, assumed, sum_as_ui);
+  } while(assumed != old);
+  __half_raw raw;
+  raw.x = old_as_us;
+  return float16(raw);
+};
+
+
+__device__ long long atomicAdd(long long *address, long long val) {
+    return atomicAdd(reinterpret_cast<unsigned long long*>(address),
+                     static_cast<unsigned long long>(val));
+}
+
+
+#if __HIPCC__
+#include <hip/hip_version.h>
+#endif // #if __HIPCC__
+
+// Skip if ROCm 4.5+ as it implements the following atomic functions.
+#if !defined(__HIPCC__) || HIP_VERSION < 40400000
+
+__device__ float atomicMax(float* address, float val) {
+    int* address_as_i = reinterpret_cast<int*>(address);
+    int old = *address_as_i, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(
+            reinterpret_cast<int*>(address), assumed,
+            __float_as_int(fmaxf(val, __int_as_float(assumed))));
+    } while (assumed != old);
+    return __int_as_float(old);
+}
+
+
+__device__ double atomicMax(double* address, double val) {
+    unsigned long long* address_as_i =
+        reinterpret_cast<unsigned long long*>(address);
+    unsigned long long old = *address_as_i, assumed;
+    do {
+        assumed = old;
+        const long long result = __double_as_longlong(
+            fmax(val, __longlong_as_double(reinterpret_cast<long long&>(assumed))));
+        old = atomicCAS(
+            address_as_i, assumed,
+            reinterpret_cast<const unsigned long long&>(result));
+    } while (assumed != old);
+    return __longlong_as_double(reinterpret_cast<long long&>(old));
+}
+
+
+__device__ float atomicMin(float* address, float val) {
+    int* address_as_i = reinterpret_cast<int*>(address);
+    int old = *address_as_i, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(
+            reinterpret_cast<int*>(address), assumed,
+            __float_as_int(fminf(val, __int_as_float(assumed))));
+    } while (assumed != old);
+    return __int_as_float(old);
+}
+
+
+__device__ double atomicMin(double* address, double val) {
+    unsigned long long* address_as_i =
+        reinterpret_cast<unsigned long long*>(address);
+    unsigned long long old = *address_as_i, assumed;
+    do {
+        assumed = old;
+        const long long result = __double_as_longlong(
+            fmin(val, __longlong_as_double(reinterpret_cast<long long&>(assumed))));
+        old = atomicCAS(
+            address_as_i, assumed,
+            reinterpret_cast<const unsigned long long&>(result));
+    } while (assumed != old);
+    return __longlong_as_double(reinterpret_cast<long long&>(old));
+}
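+/* The float/double atomicMax/atomicMin above follow the classic CAS loop:
+   reinterpret the value's bits as an integer word, compute the update in
+   floating point, and publish it with atomicCAS; if another thread raced in
+   between, the loop retries against the freshly observed value. The same
+   skeleton implements any floating-point read-modify-write the hardware
+   lacks, e.g. an atomic multiply (sketch, not part of this file):
+
+   __device__ double atomicMul(double* address, double val) {
+       unsigned long long* p = reinterpret_cast<unsigned long long*>(address);
+       unsigned long long old = *p, assumed;
+       do {
+           assumed = old;
+           const double updated = __longlong_as_double((long long)assumed) * val;
+           old = atomicCAS(p, assumed,
+                           (unsigned long long)__double_as_longlong(updated));
+       } while (assumed != old);
+       return __longlong_as_double((long long)old);
+   }
+*/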
+#endif // #if !defined(__HIPCC__) || HIP_VERSION < 40400000
diff --git a/cupy/_core/include/cupy/carray.cuh b/cupy/_core/include/cupy/carray.cuh
new file mode 100644
index 0000000..e117a3a
--- /dev/null
+++ b/cupy/_core/include/cupy/carray.cuh
@@ -0,0 +1,849 @@
+#pragma once
+
+#if __cplusplus >= 201103 || (defined(_MSC_VER) && _MSC_VER >= 1900)
+#ifndef __CUDACC_RTC__
+// in NVRTC std:initializer_list is pre-defined (no need to include it)
+#include <initializer_list>
+#endif
+#endif
+
+// Basic implementation of std::type_traits
+// We use this regardless when C++ is requested, as NVRTC by default lacks many
+// C++ features like this. We need to wrap in a namespace in case Jitify kicks
+// in and/or users provide custom definitions.
+namespace cupy {
+    namespace type_traits {
+        template <bool B, class T, class F>
+        struct conditional { typedef T type; };
+        template <class T, class F>
+        struct conditional<false, T, F> { typedef F type; };
+
+        template <bool B, class T = void>
+        struct enable_if {};
+        template <class T>
+        struct enable_if<true, T> { typedef T type; };
+    }
+}
+
+// math
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795
+#endif
+
+#ifdef __HIPCC_RTC__
+
+#include <hip/hip_version.h>
+#if HIP_VERSION >= 40400000
+// HIP runtime headers can be no longer explicitly included since ROCm 4.5 so
+// we only include necessary standard headers.
+#include <cstdint>
+#include <cstddef>
+
+// Confirmed to AMD, ROCm 5.0 doesn't recognize __forceinline__ and
+// __noinline__.
+#define __noinline__ __attribute__((noinline))
+#define __forceinline__ inline __attribute__((always_inline))
+
+#else
+#include <hip/hip_fp16.h>
+#endif // #if HIP_VERSION >= 40400000
+
+#elif __HIPCC__
+
+#include <hip/hip_fp16.h>
+
+#elif __CUDACC_VER_MAJOR__ >= 9
+
+#include <cuda_fp16.h>
+
+#else // #if __CUDACC_VER_MAJOR__ >= 9
+
+struct __half_raw {
+    unsigned short x;
+};
+
+struct half {
+private:
+    unsigned short data_;
+public:
+    __device__ half() {}
+    __device__ half(const half &v) : data_(v.data_) {}
+    __device__ half(float v) : data_(__float2half_rn(v)) {}
+
+    explicit __device__ half(const __half_raw &v) : data_(v.x) {}
+    explicit __device__ half(bool v) : data_(__float2half_rn(float(v))) {}
+    explicit __device__ half(double v) : data_(__float2half_rn(float(v))) {}
+    explicit __device__ half(int v) : data_(__float2half_rn(float(v))) {}
+    explicit __device__ half(unsigned int v) : data_(__float2half_rn(float(v))) {}
+    explicit __device__ half(long long v) : data_(__float2half_rn(float(v))) {}
+    explicit __device__ half(unsigned long long v) : data_(__float2half_rn(float(v))) {}
+
+    __device__ operator float() const {return __half2float(data_);}
+    __device__ operator __half_raw() const {__half_raw ret = {data_}; return ret;}
+};
+
+#endif // #if __CUDACC_VER_MAJOR__ >= 9
+
+class float16 {
+private:
+    half data_;
+public:
+    __device__ float16() {}
+    __device__ float16(float v) : data_(v) {}
+
+    explicit __device__ float16(bool v) : data_(float(v)) {}
+    explicit __device__ float16(double v) : data_(v) {}
+    explicit __device__ float16(int v) : data_(v) {}
+    explicit __device__ float16(unsigned int v) : data_(v) {}
+    explicit __device__ float16(long long v) : data_(v) {}
+    explicit __device__ float16(unsigned long long v) : data_(v) {}
+
+    explicit __device__ float16(const half &v): data_(v) {}
+    explicit __device__ float16(const __half_raw &v): data_(v) {}
+
+    __device__ operator float() const {return float(data_);}
+
+    static const unsigned short nan = 0x7e00u;
+
+    __device__ int iszero() const {
+        return (__half_raw(data_).x & 0x7fffu) == 0;
+    }
+
+    __device__ int isnan() const {
+        __half_raw raw_ = __half_raw(data_);
+        return (raw_.x & 0x7c00u) == 0x7c00u && (raw_.x & 0x03ffu) != 0x0000u;
+    }
+
+    __device__ int isinf() const {
+        return (__half_raw(data_).x & 0x7fffu) == 0x7c00u;
+    }
+
+    __device__ int isfinite() const {
+        return (__half_raw(data_).x & 0x7c00u) != 0x7c00u;
+    }
+
+    __device__ int signbit() const {
+        return (__half_raw(data_).x & 0x8000u) != 0;
+    }
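+    /* The classification helpers above read the IEEE binary16 fields
+       directly: bit 15 is the sign, bits 14..10 the exponent, bits 9..0 the
+       mantissa. Exponent all ones (0x7c00) with a zero mantissa is infinity;
+       with a nonzero mantissa it is a NaN. Host-side equivalents of the two
+       tests, for reference (sketch):
+
+       int half_bits_isinf(unsigned short x) {
+           return (x & 0x7fffu) == 0x7c00u;  // exp all ones, mantissa zero
+       }
+       int half_bits_isnan(unsigned short x) {
+           return (x & 0x7c00u) == 0x7c00u && (x & 0x03ffu) != 0u;
+       }
+    */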
+
+#ifdef __HIPCC__
+
+    __device__ float16 operator-() {
+        return float16(-data_);
+    }
+
+#endif
+
+    template <typename T>
+    inline __device__ float16& operator+=(const T& rhs) {
+        *this = *this + rhs;
+        return *this;
+    }
+
+    template <typename T>
+    inline __device__ float16& operator-=(const T& rhs) {
+        *this = *this - rhs;
+        return *this;
+    }
+
+    template <typename T>
+    inline __device__ float16& operator*=(const T& rhs) {
+        *this = *this * rhs;
+        return *this;
+    }
+
+    template <typename T>
+    inline __device__ float16& operator/=(const T& rhs) {
+        *this = *this / rhs;
+        return *this;
+    }
+
+    friend __device__ float16 copysign(float16 x, float16 y) {
+        __half_raw x_raw_ = __half_raw(x.data_);
+        __half_raw y_raw_ = __half_raw(y.data_);
+        __half_raw ret_raw_;
+        ret_raw_.x = (x_raw_.x & 0x7fffu) | (y_raw_.x & 0x8000u);
+        return float16(ret_raw_);
+    }
+
+    friend __device__ float16 nextafter(float16 x, float16 y) {
+        __half_raw x_raw_ = __half_raw(x.data_);
+        __half_raw y_raw_ = __half_raw(y.data_);
+        __half_raw ret_raw_;
+        if (!x.isfinite() || y.isnan()) {
+            ret_raw_.x = nan;
+        } else if (eq_nonan(x, y)) {
+            ret_raw_.x = x_raw_.x;
+        } else if (x.iszero()) {
+            ret_raw_.x = (y_raw_.x & 0x8000u) + 1;
+        } else if (!(x_raw_.x & 0x8000u)) {
+            if (static_cast<signed short>(x_raw_.x) > static_cast<signed short>(y_raw_.x)) {
+                ret_raw_.x = x_raw_.x - 1;
+            } else {
+                ret_raw_.x = x_raw_.x + 1;
+            }
+        } else if(!(y_raw_.x & 0x8000u) || (x_raw_.x & 0x7fffu) > (y_raw_.x & 0x7fffu)) {
+            ret_raw_.x = x_raw_.x - 1;
+        } else {
+            ret_raw_.x = x_raw_.x + 1;
+        }
+        return float16(ret_raw_);
+    }
+
+private:
+    static __device__ int eq_nonan(const float16 x, const float16 y) {
+        __half_raw x_raw_ = __half_raw(x.data_);
+        __half_raw y_raw_ = __half_raw(y.data_);
+        return (x_raw_.x == y_raw_.x || ((x_raw_.x | y_raw_.x) & 0x7fff) == 0);
+    }
+};
+
+
+__device__ float16 min(float16 x, float16 y) {
+    return float16(min(float(x), float(y)));
+}
+__device__ float16 max(float16 x, float16 y) {
+    return float16(max(float(x), float(y)));
+}
+__device__ float16 fmin(float16 x, float16 y) {
+    return float16(fmin(float(x), float(y)));
+}
+__device__ float16 fmax(float16 x, float16 y) {
+    return float16(fmax(float(x), float(y)));
+}
+__device__ int iszero(float16 x) {return x.iszero();}
+__device__ int isnan(float16 x) {return x.isnan();}
+__device__ int isinf(float16 x) {return x.isinf();}
+__device__ int isfinite(float16 x) {return x.isfinite();}
+__device__ int signbit(float16 x) {return x.signbit();}
+
+// CArray
+#define CUPY_FOR(i, n) \
+    for (ptrdiff_t i = \
+            static_cast<ptrdiff_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
+         i < (n); \
+         i += static_cast<ptrdiff_t>(blockDim.x) * gridDim.x)
+
+#ifdef CUPY_JIT_MODE
+#ifdef CUPY_JIT_NVCC
+#include <thrust/tuple.h>
+#include <thrust/pair.h>
+#include <iterator>
+#else
+#include <cupy/tuple.cuh>
+#include <cupy/pair.cuh>
+#include <cupy/swap.cuh>
+#endif // CUPY_JIT_NVCC
+#endif // CUPY_JIT_MODE
+
+#ifdef CUPY_JIT_MODE
+namespace cupy {
+
+/*
+ * param ndim: the size of the returned tuple
+ * param T: the type of the tuple elements
+ */
+template <int ndim, typename T>
+struct as_tuple {
+
+    template <int _ndim, typename... Args>
+    struct as_tuple_impl {
+        using ChildClass = as_tuple_impl<_ndim - 1, T, Args...>;
+        using type = typename ChildClass::type;
+
+        template <typename Ints>
+        __device__ static type call(Ints ints, Args... args) {
+            return ChildClass::call(ints, ints[_ndim - 1], args...);
+        }
+    };
+
+    template <typename... Args>
+    struct as_tuple_impl<0, Args...> {
+        using type = thrust::tuple<Args...>;
+
+        template <typename Ints>
+        __device__ static type call(Ints ints, Args... args) {
+            return thrust::make_tuple(args...);
+        }
+    };
+
+    using type = typename as_tuple_impl<ndim>::type;
+
+    template <typename Ints, typename... Args>
+    __device__ static type call(Ints ints, Args... args) {
+        return as_tuple_impl<ndim>::call(ints, args...);
+    }
+
+};
+
+} // namespace cupy
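+/* cupy::as_tuple<ndim, T> turns a C array of ndim integers into a
+   thrust::tuple of ndim T values by template recursion: each level peels
+   index _ndim - 1 off the array and prepends it to the parameter pack, so
+   as_tuple<3, ptrdiff_t>::call(s) expands to
+   thrust::make_tuple(s[0], s[1], s[2]). Usage sketch:
+
+   ptrdiff_t s[3] = {4, 5, 6};
+   cupy::as_tuple<3, ptrdiff_t>::type t = cupy::as_tuple<3, ptrdiff_t>::call(s);
+   // thrust::get<0>(t) == 4, thrust::get<2>(t) == 6
+*/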
+
+template <int dim>
+struct Dim {
+    __device__ Dim() {}
+};
+#endif // CUPY_JIT_MODE
+
+template <typename T, typename index_t>
+class CArrayIterator {
+public:
+    typedef ptrdiff_t difference_type;
+    typedef T value_type;
+    typedef T* pointer;
+    typedef T& reference;
+#ifdef CUPY_JIT_NVCC
+    typedef std::random_access_iterator_tag iterator_category;
+#endif // CUPY_JIT_NVCC
+
+private:
+    T* head_;
+    index_t step_;
+public:
+    __host__ __device__ CArrayIterator(T* head, index_t step) {
+        this->head_ = head;
+        this->step_ = step;
+    }
+    __host__ __device__ CArrayIterator(const CArrayIterator& itr) {
+        this->head_ = itr.head_;
+        this->step_ = itr.step_;
+    }
+    __host__ __device__ bool operator==(const CArrayIterator& itr) const {
+        return (this->head_ == itr.head_) && (this->step_ == itr.step_);
+    }
+    __host__ __device__ bool operator!=(const CArrayIterator& itr) const {
+        return !(*this == itr);
+    }
+    __host__ __device__ T& operator*() const {
+        return *(this->head_);
+    }
+    __host__ __device__ const T* operator->() const {
+        return this->head_;
+    }
+    __host__ __device__ CArrayIterator& operator++() {
+        this->head_ += this->step_;
+        return *this;
+    }
+    __host__ __device__ CArrayIterator operator++(int) {
+        CArrayIterator tmp = *this;
+        this->head_ += this->step_;
+        return tmp;
+    }
+    __host__ __device__ CArrayIterator& operator--() {
+        this->head_ -= this->step_;
+        return *this;
+    }
+    __host__ __device__ CArrayIterator operator--(int) {
+        CArrayIterator tmp = *this;
+        this->head_ -= this->step_;
+        return tmp;
+    }
+    __host__ __device__ CArrayIterator operator+(ptrdiff_t n) const {
+        CArrayIterator out = *this;
+        out.head_ += out.step_ * n;
+        return out;
+    }
+    __host__ __device__ difference_type operator-(const CArrayIterator& itr) const {
+        return (this->head_ - itr.head_) / this->step_;
+    }
+    __host__ __device__ CArrayIterator operator-(ptrdiff_t n) const {
+        CArrayIterator out = *this;
+        out.head_ -= out.step_ * n;
+        return out;
+    }
+    __host__ __device__ CArrayIterator& operator+=(ptrdiff_t n) {
+        this->head_ += this->step_ * n;
+        return *this;
+    }
+    __host__ __device__ CArrayIterator& operator-=(ptrdiff_t n) {
+        this->head_ -= this->step_ * n;
+        return *this;
+    }
+    __host__ __device__ T& operator[](index_t n) const {
+        return *(this->head_ + this->step_ * n);
+    }
+    __host__ __device__ bool operator<(const CArrayIterator& itr) const {
+        return this->head_ < itr.head_;
+    }
+    __host__ __device__ bool operator>(const CArrayIterator& itr) const {
+        return this->head_ > itr.head_;
+    }
+    __host__ __device__ bool operator<=(const CArrayIterator& itr) const {
+        return !(*this > itr);
+    }
+    __host__ __device__ bool operator>=(const CArrayIterator& itr) const {
+        return !(*this < itr);
+    }
+};
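+/* CArray<T, ndim, c_contiguous, use_32bit_indexing> below is the view CuPy
+   passes to device kernels: a raw pointer plus shape and strides, with
+   strides expressed in bytes as in NumPy. When _use_32bit_indexing is true,
+   index_t becomes int rather than ptrdiff_t: 64-bit multiply and divide are
+   emulated on GPUs, so 32-bit index arithmetic is markedly cheaper whenever
+   every extent fits. */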
+template <typename T, int _ndim, bool _c_contiguous, bool _use_32bit_indexing>
+class CArray {
+public:
+    static const int ndim = _ndim;
+    static const bool c_contiguous = _c_contiguous;
+    typedef typename cupy::type_traits::conditional<_use_32bit_indexing, int, ptrdiff_t>::type index_t;
+    typedef typename cupy::type_traits::conditional<_c_contiguous, T*, CArrayIterator<T, index_t> >::type iterator;
+
+private:
+    T* data_;
+    ptrdiff_t size_;
+    ptrdiff_t shape_[ndim];
+    ptrdiff_t strides_[ndim];
+
+public:
+    // Constructor supports pointers or initializer lists and strides is optional
+    // as long as _c_contiguous=true.
+    //     CArray<T, 3> ca(data, shape, strides);
+    //     CArray<T, 3> ca(data, shape);
+    //     CArray<T, 3> ca(data, {1, 2, 3}, {48, 24, 8});
+    //     CArray<T, 3> ca(data, {1, 2, 3});
+    // Initializer lists and optional strides are only supported with -std=c++11
+    // or higher.
+
+    template <typename Int1, typename Int2>
+    __device__ CArray(T* data, const Int1* shape, const Int2* strides)
+        : data_(data), size_(1)
+    {
+        if (_c_contiguous) {
+            assert(strides[_ndim-1] == sizeof(T));
+            for (int i = _ndim-1; i > 0; i--) {
+                assert(strides[i-1] == shape[i] * strides[i]);
+            }
+        }
+        for (int i = 0; i < _ndim; i++) {
+            this->size_ *= shape[i];
+            this->shape_[i] = shape[i];
+            this->strides_[i] = strides[i];
+        }
+    }
+
+#if __cplusplus >= 201103 || (defined(_MSC_VER) && _MSC_VER >= 1900)
+    template <typename Int, typename U = T>
+    __device__ CArray(typename cupy::type_traits::enable_if<_c_contiguous, U>::type* data,
+                      const Int* shape)
+        : data_(data), size_(1)
+    {
+        for (int i = 0; i < _ndim; i++) {
+            this->size_ *= shape[i];
+            this->shape_[i] = shape[i];
+        }
+        this->strides_[_ndim-1] = sizeof(T);
+        for (int i = _ndim-1; i > 0; i--) {
+            this->strides_[i-1] = shape[i] * this->strides_[i];
+        }
+    }
+
+    template <typename Int, typename U = T>
+    __device__ CArray(typename cupy::type_traits::enable_if<_c_contiguous, U>::type* data,
+                      const std::initializer_list<Int> shape)
+        : CArray(data, shape.begin())
+    {
+        assert(shape.size() == _ndim);
+    }
+
+    template <typename Int>
+    __device__ CArray(T* data,
+                      const std::initializer_list<Int> shape,
+                      const std::initializer_list<Int> strides)
+        : CArray(data, shape.begin(), strides.begin())
+    {
+        assert(shape.size() == _ndim);
+        assert(strides.size() == _ndim);
+    }
+#endif
+
+    __device__ CArray() : data_(NULL), size_(1)
+    {
+        memset(this->shape_, 0, sizeof(this->shape_));
+        memset(this->strides_, 0, sizeof(this->strides_));
+    }
+
+    __device__ ptrdiff_t size() const {
+        return size_;
+    }
+
+    __device__ const ptrdiff_t* shape() const {
+        return shape_;
+    }
+
+    __device__ const ptrdiff_t* strides() const {
+        return strides_;
+    }
+
+#ifdef CUPY_JIT_MODE
+    __device__ typename cupy::as_tuple<_ndim, ptrdiff_t>::type get_shape() const {
+        return cupy::as_tuple<_ndim, ptrdiff_t>::call(shape_);
+    }
+
+    __device__ typename cupy::as_tuple<_ndim, ptrdiff_t>::type get_strides() const {
+        return cupy::as_tuple<_ndim, ptrdiff_t>::call(strides_);
+    }
+#endif // CUPY_JIT_MODE
+
+#if __cplusplus >= 201103 || (defined(_MSC_VER) && _MSC_VER >= 1900)
+    template <typename Int>
+    __device__ T& operator[](const std::initializer_list<Int> idx_) {
+        assert(idx_.size() == _ndim);
+        Int idx[ndim];
+        memcpy(idx, idx_.begin(), ndim*sizeof(Int));
+        return this->operator[](idx);
+    }
+
+    template <typename Int>
+    __device__ const T& operator[](const std::initializer_list<Int> idx_) const {
+        assert(idx_.size() == _ndim);
+        Int idx[ndim];
+        memcpy(idx, idx_.begin(), ndim*sizeof(Int));
+        return this->operator[](idx);
+    }
+#endif
+
+    template <typename Int>
+    __device__ T& operator[](const Int (&idx)[ndim]) {
+        return const_cast<T&>(const_cast<const CArray&>(*this)[idx]);
+    }
+
+    template <typename Int>
+    __device__ const T& operator[](const Int (&idx)[ndim]) const {
+        index_t diff = 0;
+        for (int dim = 0; dim < ndim; ++dim) {
+            diff += static_cast<index_t>(strides_[dim]) * static_cast<index_t>(idx[dim]);
+        }
+        const char* ptr = reinterpret_cast<const char*>(data_);
+        return *reinterpret_cast<const T*>(ptr + diff);
+    }
+
+    __device__ T& operator[](ptrdiff_t i) {
+        return const_cast<T&>(const_cast<const CArray&>(*this)[i]);
+    }
+
+#ifdef CUPY_JIT_MODE
+    __forceinline__ __device__ iterator begin_ptr() const {
+        return reinterpret_cast<iterator>(data_);
+    }
+
+    __forceinline__ __device__ iterator end_ptr() const {
+        return reinterpret_cast<iterator>(data_) + size_;
+    }
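+    /* The non-contiguous branch of operator[](ptrdiff_t) near the end of
+       this class unravels a flat C-order index into per-dimension indices
+       with % and /, then walks the byte strides. Worked example (sketch):
+       for a double array with shape (2, 3) and strides (24, 8) bytes, flat
+       idx = 4 gives
+           dim 1: 4 % 3 = 1 -> diff += 8 * 1;  i = 4 / 3 = 1
+           dim 0:           -> diff += 24 * 1
+       so diff = 32 bytes, i.e. element (1, 1), which matches data_[4] on the
+       contiguous fast path. */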
return iterator(data_, strides_[0] / sizeof(T)); + } + + __forceinline__ __device__ iterator end() const { + return iterator(data_ + size_ * strides_[0] / sizeof(T), strides_[0] / sizeof(T)); + } + + template + __forceinline__ __device__ const T& _indexing(const Tuple &idx, Dim, const char* ptr) const { + index_t i = static_cast(thrust::get(idx)); + ptr += static_cast(strides_[dim]) * i; + return _indexing(idx, Dim(), ptr); + } + + template + __forceinline__ __device__ const T& _indexing(const Tuple &idx, Dim<_ndim>, const char* ptr) const { + return *reinterpret_cast(ptr); + } + + template + __forceinline__ __device__ const T& _indexing(const Tuple &idx) const { + const char* ptr = reinterpret_cast(data_); + return _indexing(idx, Dim<0>(), ptr); + } + + template + __forceinline__ __device__ T& _indexing(const Tuple &idx) { + return const_cast(const_cast(*this)._indexing(idx)); + } + + template + __forceinline__ __device__ char* _slicing(const Tuple &idx, char* new_head_ptr, Dim, Dim) const { + index_t i = static_cast(thrust::get(idx)); + new_head_ptr += static_cast(strides_[dim]) * i; + return _slicing(idx, new_head_ptr, Dim(), Dim()); + } + + template + __forceinline__ __device__ char* _slicing(const Tuple &idx, char* new_head_ptr, Dim, Dim) const { + return new_head_ptr; + } + + template + __forceinline__ __device__ CArray _slicing(const Tuple &idx, Dim) { + char* new_head_ptr = reinterpret_cast(data_); + new_head_ptr = _slicing(idx, new_head_ptr, Dim<0>(), Dim()); + T* new_head = reinterpret_cast(new_head_ptr); + return CArray(new_head, shape_ + dimreduce, strides_ + dimreduce); + } + + __forceinline__ __device__ CArray _slicing(const int idx) { + char* new_head_ptr = reinterpret_cast(data_); + index_t i = static_cast(idx); + new_head_ptr += static_cast(strides_[0]) * i; + T* new_head = reinterpret_cast(new_head_ptr); + return CArray(new_head, shape_ + 1, strides_ + 1); + } +#endif // CUPY_JIT_MODE + + __device__ const T& operator[](ptrdiff_t idx) const { + if (c_contiguous) { + // contiguous arrays can be directly addressed by the + // numeric value, avoiding expensive 64 bit operations in cuda + return data_[idx]; + } else { + // 64-bit mults and divs are pretty expensive and can lead to severe + // performance degradation in computation bound kernels + index_t diff = 0; + index_t i = static_cast(idx); + for (int dim = ndim; --dim > 0; ) { + index_t shape_dim = static_cast(shape_[dim]); + diff += static_cast(strides_[dim]) * (i % shape_dim); + i /= shape_dim; + } + diff += static_cast(strides_[0]) * i; + const char* ptr = reinterpret_cast(data_); + return *reinterpret_cast(ptr + diff); + } + } +}; + +template + +class CArray { +private: + T* data_; + ptrdiff_t size_; + +public: + static const int ndim = 0; + + __device__ CArray() : data_(NULL), size_(1) { } + + __device__ explicit CArray(T* data) : data_(data), size_(1) { } + + template + __device__ CArray(T* data, Int size) : data_(data), size_(size) { } + + // These constructors are just to match the non-0-dim constructors + template + __device__ CArray(T* data, const Int1* shape, const Int2* strides) + : data_(data), size_(1) { } + +#if __cplusplus >= 201103 || (defined(_MSC_VER) && _MSC_VER >= 1900) + __device__ CArray(T* data, + const std::initializer_list shape, + const std::initializer_list strides) + : data_(data), size_(1) + { + assert(shape.size() == 0); + assert(strides.size() == 0); + } +#endif + + + __device__ ptrdiff_t size() const { + return size_; + } + + __device__ const ptrdiff_t* shape() const { + return NULL; + } 
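+
+    // A 0-dim CArray wraps a single element: shape() and strides() return
+    // NULL, and operator[] ignores its argument, always dereferencing
+    // data_. Illustrative sketch (template flags abbreviated; ptr is a
+    // hypothetical device pointer):
+    //   CArray<float, 0> a(ptr);
+    //   a[42];  // same element as *ptr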
+ + __device__ const ptrdiff_t* strides() const { + return NULL; + } + + template + __device__ T& operator[](const U&) { + return *data_; + } + + template + __device__ T operator[](const U&) const { + return *data_; + } +}; + +template +class CIndexer { +public: + static const int ndim = _ndim; +private: + ptrdiff_t size_; + ptrdiff_t shape_[ndim]; + ptrdiff_t index_[ndim]; + + typedef ptrdiff_t index_t[ndim]; + +public: + // Constructor supports pointers or initializer lists and index is optional + // CIndexer<3> ca(shape, index); + // CIndexer<3> ca({1, 2, 3}, {0, 1, 1}); + // CIndexer<3> ca = {1, 2, 3}; + // Initializer lists are only supported with -std=c++11 or higher. + + template + __device__ explicit CIndexer(const Int* shape) + : size_(1) { + for (int i = 0; i < _ndim; i++) { + this->size_ *= shape[i]; + this->shape_[i] = shape[i]; + this->index_[i] = 0; + } + } + + template + __device__ CIndexer(const Int1* shape, const Int2* index) + : size_(1) { + for (int i = 0; i < _ndim; i++) { + this->size_ *= shape[i]; + this->shape_[i] = shape[i]; + this->index_[i] = index[i]; + } + } + +#if __cplusplus >= 201103 || (defined(_MSC_VER) && _MSC_VER >= 1900) + template + __device__ CIndexer(const std::initializer_list shape) + : CIndexer(shape.begin()) + { + assert(shape.size() == _ndim); + } + + template + __device__ CIndexer(const std::initializer_list shape, + const std::initializer_list index) + : CIndexer(shape.begin(), index.begin()) + { + assert(shape.size() == _ndim); + assert(index.size() == _ndim); + } +#endif + + __device__ CIndexer() : size_(1) + { + memset(this->shape_, 0, sizeof(this->shape_)); + memset(this->index_, 0, sizeof(this->index_)); + } + + __device__ ptrdiff_t size() const { + return size_; + } + + __device__ const index_t& get() const { + return index_; + } + + __device__ void set(ptrdiff_t i) { + // ndim == 0 case uses partial template specialization + if (ndim == 1) { + index_[0] = i; + } else if (!_use_32bit_indexing && size_ > 1LL << 31) { + // 64-bit division is very slow on GPU + this->_set(static_cast(i)); + } else { + this->_set(static_cast(i)); + } + } + +private: + template + __device__ void _set(index_t i) { + for (int dim = ndim; --dim > 0; ) { + index_t s = static_cast(shape_[dim]); + if (s & (s - 1)) { + index_t t = i / s; + index_[dim] = i - t * s; + i = t; + } else { // exp of 2 + index_[dim] = i & (s - 1); + i >>= _log2(s); + } + } + index_[0] = i; + } + + // can also be implemented as __ffs(x)-1 or 31-__clz(x) + static unsigned int __device__ _log2(unsigned int x) { return __popc(x-1); } + static unsigned long long int __device__ _log2(unsigned long long int x) { return __popcll(x-1); } +}; + +template +class CIndexer<0, _use_32bit_indexing> { +private: + ptrdiff_t size_; + +public: + static const int ndim = 0; + + __device__ CIndexer() : size_(1) { } + + template + __device__ explicit CIndexer(Int size) : size_(size) { } + + // These constructors are just to match the non-0-dim constructors + template + __device__ explicit CIndexer(const Int* shape) : size_(1) { } + + template + __device__ CIndexer(const Int1* shape, const Int2* strides) + : size_(1) { } + +#if __cplusplus >= 201103 || (defined(_MSC_VER) && _MSC_VER >= 1900) + __device__ CIndexer(const std::initializer_list shape) + : size_(1) + { + assert(shape.size() == 0); + } + + __device__ CIndexer(const std::initializer_list shape, + const std::initializer_list index) + : size_(1) + { + assert(shape.size() == 0); + assert(index.size() == 0); + } +#endif + + __device__ ptrdiff_t size() const 
{ + return size_; + } + + __device__ void set(ptrdiff_t i) { + } + + __device__ const ptrdiff_t* get() const { + return NULL; + } +}; + +__device__ int _floor_divide(int x, int y) { + if (y == 0) return 0; + int q = x / y; + return q - (((x < 0) != (y < 0)) && q * y != x); +} + +__device__ long long _floor_divide(long long x, long long y) { + if (y == 0) return 0; + long long q = x / y; + return q - (((x < 0) != (y < 0)) && q * y != x); +} + +__device__ unsigned _floor_divide(unsigned x, unsigned y) { + if (y == 0) return 0; + return x / y; +} + +__device__ unsigned long long _floor_divide( + unsigned long long x, unsigned long long y) { + if (y == 0) return 0; + return x / y; +} + +__device__ float _floor_divide(float x, float y) { + return floor(x / y); +} + +__device__ double _floor_divide(double x, double y) { + return floor(x / y); +} diff --git a/cupy/_core/include/cupy/complex.cuh b/cupy/_core/include/cupy/complex.cuh new file mode 100644 index 0000000..fa4908b --- /dev/null +++ b/cupy/_core/include/cupy/complex.cuh @@ -0,0 +1,100 @@ +#pragma once + +#include + +using thrust::complex; +using thrust::conj; +using thrust::real; +using thrust::imag; +using thrust::arg; +using thrust::exp; +using thrust::log; +using thrust::log10; +using thrust::sin; +using thrust::cos; +using thrust::tan; +using thrust::sinh; +using thrust::cosh; +using thrust::tanh; +using thrust::asinh; +using thrust::acosh; +using thrust::atanh; +using thrust::asin; +using thrust::acos; +using thrust::atan; + +template +__host__ __device__ bool isnan(complex x) { + return isnan(x.real()) || isnan(x.imag()); +} + +template +__host__ __device__ bool isinf(complex x) { + return isinf(x.real()) || isinf(x.imag()); +} + +template +__host__ __device__ bool isfinite(complex x) { + return isfinite(x.real()) && isfinite(x.imag()); +} + +template +__host__ __device__ complex log1p(complex x) { + x += 1; + return log(x); +} + +template +__host__ __device__ complex log2(complex x) { + complex y = log(x); + y /= log(T(2)); + return y; +} + +template +__host__ __device__ complex expm1(complex x) { + complex y = exp(x); + y -= 1; + return y; +} + +template +__host__ __device__ complex min(complex x, complex y) { + if (isnan(x)) { + return y; + } else if (isnan(y)) { + return x; + } else if (x.real() < y.real()) { + return x; + } else if (x.real() > y.real()) { + return y; + } else if (x.imag() < y.imag()) { + return x; + } else { + return y; + } +} + +template +__host__ __device__ complex max(complex x, complex y) { + if (isnan(x)) { + return y; + } else if (isnan(y)) { + return x; + } else if (x.real() < y.real()) { + return y; + } else if (x.real() > y.real()) { + return x; + } else if (x.imag() < y.imag()) { + return y; + } else { + return x; + } +} + +template +__host__ __device__ complex rint(complex x) { + return complex(rint(x.real()), rint(x.imag())); +} + +// ToDo: assignment operator for complex = T2 for T2 all types diff --git a/cupy/_core/include/cupy/complex/README.md b/cupy/_core/include/cupy/complex/README.md new file mode 100644 index 0000000..ee065ba --- /dev/null +++ b/cupy/_core/include/cupy/complex/README.md @@ -0,0 +1,3 @@ +These files are copied from thrust project and are modified. 
+ + http://thrust.github.io/ \ No newline at end of file diff --git a/cupy/_core/include/cupy/complex/arithmetic.h b/cupy/_core/include/cupy/complex/arithmetic.h new file mode 100644 index 0000000..0cc5293 --- /dev/null +++ b/cupy/_core/include/cupy/complex/arithmetic.h @@ -0,0 +1,314 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace thrust { + +/* --- Binary Arithmetic Operators --- */ + +template +__host__ __device__ inline complex operator+(const complex& lhs, + const complex& rhs) { + return complex(lhs.real() + rhs.real(), lhs.imag() + rhs.imag()); +} + +template +__host__ __device__ inline complex operator+(const volatile complex& lhs, + const volatile complex& rhs) { + return complex(lhs.real() + rhs.real(), lhs.imag() + rhs.imag()); +} + +template +__host__ __device__ inline complex operator+(const complex& lhs, + const T& rhs) { + return complex(lhs.real() + rhs, lhs.imag()); +} + +template +__host__ __device__ inline complex operator+(const T& lhs, + const complex& rhs) { + return complex(rhs.real() + lhs, rhs.imag()); +} + +// TODO(leofang): support operator+ for (complex x, complex y) + +template +__host__ __device__ inline complex operator-(const complex& lhs, + const complex& rhs) { + return complex(lhs.real() - rhs.real(), lhs.imag() - rhs.imag()); +} + +template +__host__ __device__ inline complex operator-(const complex& lhs, + const T& rhs) { + return complex(lhs.real() - rhs, lhs.imag()); +} + +template +__host__ __device__ inline complex operator-(const T& lhs, + const complex& rhs) { + return complex(lhs - rhs.real(), -rhs.imag()); +} + +// TODO(leofang): support operator- for (complex x, complex y) + +template +__host__ __device__ inline complex operator*(const complex& lhs, + const complex& rhs) { + return complex(lhs.real() * rhs.real() - lhs.imag() * rhs.imag(), + lhs.real() * rhs.imag() + lhs.imag() * rhs.real()); +} + +template +__host__ __device__ inline complex operator*(const complex& lhs, + const T& rhs) { + return complex(lhs.real() * rhs, lhs.imag() * rhs); +} + +template +__host__ __device__ inline complex operator*(const T& lhs, + const complex& rhs) { + return complex(rhs.real() * lhs, rhs.imag() * lhs); +} + +// TODO(leofang): support operator* for (complex x, complex y) + +template +__host__ __device__ inline complex operator/(const complex& lhs, + const complex& rhs) { + T s = abs(rhs.real()) + abs(rhs.imag()); + T oos = T(1.0) / s; + T ars = lhs.real() * oos; + T ais = lhs.imag() * oos; + T brs = rhs.real() * oos; + T bis = rhs.imag() * oos; + s = (brs * brs) + (bis * bis); + oos = T(1.0) / s; + complex quot(((ars * brs) + (ais * bis)) * oos, + ((ais * brs) - (ars * bis)) * oos); + return quot; +} + +template +__host__ __device__ inline complex operator/(const complex& lhs, + const T& rhs) { + return complex(lhs.real() / rhs, lhs.imag() / rhs); +} + +template +__host__ __device__ inline complex 
operator/(const T& lhs, + const complex& rhs) { + return complex(lhs) / rhs; +} + +// TODO(leofang): support operator/ for (complex x, complex y) + +/* --- Unary comparison with Numpy logic. This means that a + bi > c + di if either + * a > c or a == c and b > d. --- */ + +template +__host__ __device__ inline bool operator<(const complex& lhs, + const complex& rhs) { + if (lhs == rhs) { + return false; + } else if (lhs.real() < rhs.real()) { + return true; + } else if (lhs.real() == rhs.real()) { + return lhs.imag() < rhs.imag(); + } else { + return false; + } +} + +template +__host__ __device__ inline bool operator<=(const complex& lhs, + const complex& rhs) { + if (lhs == rhs || lhs < rhs) { + return true; + } else { + return false; + } +} + +template +__host__ __device__ inline bool operator>(const complex& lhs, + const complex& rhs) { + if (lhs == rhs) { + return false; + } else if (lhs.real() > rhs.real()) { + return true; + } else if (lhs.real() == rhs.real()) { + return lhs.imag() > rhs.imag(); + } else { + return false; + } +} + +template +__host__ __device__ inline bool operator>=(const complex& lhs, + const complex& rhs) { + if (lhs == rhs || lhs > rhs) { + return true; + } else { + return false; + } +} + +template +__host__ __device__ inline bool operator<(const T& lhs, + const complex& rhs) { + return complex(lhs) < rhs; +} + +template +__host__ __device__ inline bool operator>(const T& lhs, + const complex& rhs) { + return complex(lhs) > rhs; +} + +template +__host__ __device__ inline bool operator<(const complex& lhs, + const T& rhs) { + return lhs < complex(rhs); +} + +template +__host__ __device__ inline bool operator>(const complex& lhs, + const T& rhs) { + return lhs > complex(rhs); +} + +template +__host__ __device__ inline bool operator<=(const T& lhs, + const complex& rhs) { + return complex(lhs) <= rhs; +} + +template +__host__ __device__ inline bool operator>=(const T& lhs, + const complex& rhs) { + return complex(lhs) >= rhs; +} + +template +__host__ __device__ inline bool operator<=(const complex& lhs, + const T& rhs) { + return lhs <= complex(rhs); +} + +template +__host__ __device__ inline bool operator>=(const complex& lhs, + const T& rhs) { + return lhs >= complex(rhs); +} + +/* --- Unary Arithmetic Operators --- */ + +template +__host__ __device__ inline complex operator+(const complex& rhs) { + return rhs; +} + +template +__host__ __device__ inline complex operator-(const complex& rhs) { + return rhs * -T(1); +} + +/* --- Other Basic Arithmetic Functions --- */ + +// As hypot is only C++11 we have to use the C interface +template +__host__ __device__ inline T abs(const complex& z) { + return hypot(z.real(), z.imag()); +} + +namespace detail { +namespace complex { +__host__ __device__ inline float abs(const thrust::complex& z) { + return hypotf(z.real(), z.imag()); +} + +__host__ __device__ inline double abs(const thrust::complex& z) { + return hypot(z.real(), z.imag()); +} +} +} + +template <> +__host__ __device__ inline float abs(const complex& z) { + return detail::complex::abs(z); +} +template <> +__host__ __device__ inline double abs(const complex& z) { + return detail::complex::abs(z); +} + +template +__host__ __device__ inline T arg(const complex& z) { + return atan2(z.imag(), z.real()); +} + +template +__host__ __device__ inline complex conj(const complex& z) { + return complex(z.real(), -z.imag()); +} + +template +__host__ __device__ inline T real(const complex& z) { + return z.real(); +} + +template +__host__ __device__ inline T imag(const complex& 
z) { + return z.imag(); +} + +template +__host__ __device__ inline T norm(const complex& z) { + return z.real() * z.real() + z.imag() * z.imag(); +} + +template <> +__host__ __device__ inline float norm(const complex& z) { + if (::abs(z.real()) < ::sqrtf(FLT_MIN) && ::abs(z.imag()) < ::sqrtf(FLT_MIN)) { + float a = z.real() * 4.0f; + float b = z.imag() * 4.0f; + return (a * a + b * b) / 16.0f; + } + return z.real() * z.real() + z.imag() * z.imag(); +} + +template <> +__host__ __device__ inline double norm(const complex& z) { + if (::abs(z.real()) < ::sqrt(DBL_MIN) && ::abs(z.imag()) < ::sqrt(DBL_MIN)) { + double a = z.real() * 4.0; + double b = z.imag() * 4.0; + return (a * a + b * b) / 16.0; + } + return z.real() * z.real() + z.imag() * z.imag(); +} + +template +__host__ __device__ inline complex polar(const T& m, + const T& theta) { + return complex(m * cos(theta), m * sin(theta)); +} +} diff --git a/cupy/_core/include/cupy/complex/catrig.h b/cupy/_core/include/cupy/complex/catrig.h new file mode 100644 index 0000000..2d29d78 --- /dev/null +++ b/cupy/_core/include/cupy/complex/catrig.h @@ -0,0 +1,730 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2012 Stephen Montgomery-Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Adapted from FreeBSD by Filipe Maia : + * freebsd/lib/msun/src/catrig.c + */ + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +using thrust::complex; + +__host__ __device__ inline void raise_inexact() { + const volatile float tiny = 7.888609052210118054117286e-31; /* 0x1p-100; */ + // needs the volatile to prevent compiler from ignoring it + volatile float junk = 1 + tiny; + (void)junk; +} + +__host__ __device__ inline complex clog_for_large_values(complex z); + +/* + * Testing indicates that all these functions are accurate up to 4 ULP. + * The functions casin(h) and cacos(h) are about 2.5 times slower than asinh. + * The functions catan(h) are a little under 2 times slower than atanh. + * + * The code for casinh, casin, cacos, and cacosh comes first. The code is + * rather complicated, and the four functions are highly interdependent. + * + * The code for catanh and catan comes at the end. It is much simpler than + * the other functions, and the code for these can be disconnected from the + * rest of the code. + */ + +/* + * ================================ + * | casinh, casin, cacos, cacosh | + * ================================ + */ + +/* + * The algorithm is very close to that in "Implementing the complex arcsine + * and arccosine functions using exception handling" by T. E. Hull, Thomas F. + * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on + * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335, + * http://dl.acm.org/citation.cfm?id=275324. + * + * Throughout we use the convention z = x + I*y. + * + * casinh(z) = sign(x)*log(A+sqrt(A*A-1)) + I*asin(B) + * where + * A = (|z+I| + |z-I|) / 2 + * B = (|z+I| - |z-I|) / 2 = y/A + * + * These formulas become numerically unstable: + * (a) for Re(casinh(z)) when z is close to the line segment [-I, I] (that + * is, Re(casinh(z)) is close to 0); + * (b) for Im(casinh(z)) when z is close to either of the intervals + * [I, I*infinity) or (-I*infinity, -I] (that is, |Im(casinh(z))| is + * close to PI/2). + * + * These numerical problems are overcome by defining + * f(a, b) = (hypot(a, b) - b) / 2 = a*a / (hypot(a, b) + b) / 2 + * Then if A < A_crossover, we use + * log(A + sqrt(A*A-1)) = log1p((A-1) + sqrt((A-1)*(A+1))) + * A-1 = f(x, 1+y) + f(x, 1-y) + * and if B > B_crossover, we use + * asin(B) = atan2(y, sqrt(A*A - y*y)) = atan2(y, sqrt((A+y)*(A-y))) + * A-y = f(x, y+1) + f(x, y-1) + * where without loss of generality we have assumed that x and y are + * non-negative. + * + * Much of the difficulty comes because the intermediate computations may + * produce overflows or underflows. This is dealt with in the paper by Hull + * et al by using exception handling. We do this by detecting when + * computations risk underflow or overflow. The hardest part is handling the + * underflows when computing f(a, b). + * + * Note that the function f(a, b) does not appear explicitly in the paper by + * Hull et al, but the idea may be found on pages 308 and 309. Introducing the + * function f(a, b) allows us to concentrate many of the clever tricks in this + * paper into one function. + */ + +/* + * Function f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2. + * Pass hypot(a, b) as the third argument. + */ +__host__ __device__ inline double f(double a, double b, double hypot_a_b) { + if (b < 0) return ((hypot_a_b - b) / 2); + if (b == 0) return (a / 2); + return (a * a / (hypot_a_b + b) / 2); +} + +/* + * All the hard work is contained in this function. 
+ * x and y are assumed positive or zero, and less than RECIP_EPSILON. + * Upon return: + * rx = Re(casinh(z)) = -Im(cacos(y + I*x)). + * B_is_usable is set to 1 if the value of B is usable. + * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y. + * If returning sqrt_A2my2 has potential to result in an underflow, it is + * rescaled, and new_y is similarly rescaled. + */ +__host__ __device__ inline void do_hard_work(double x, double y, double* rx, + int* B_is_usable, double* B, + double* sqrt_A2my2, double* new_y) { + double R, S, A; /* A, B, R, and S are as in Hull et al. */ + double Am1, Amy; /* A-1, A-y. */ + const double A_crossover = + 10; /* Hull et al suggest 1.5, but 10 works better */ + const double FOUR_SQRT_MIN = + 5.966672584960165394632772e-154; /* =0x1p-509; >= 4 * sqrt(DBL_MIN) */ + const double B_crossover = 0.6417; /* suggested by Hull et al */ + + R = hypot(x, y + 1); /* |z+I| */ + S = hypot(x, y - 1); /* |z-I| */ + + /* A = (|z+I| + |z-I|) / 2 */ + A = (R + S) / 2; + /* + * Mathematically A >= 1. There is a small chance that this will not + * be so because of rounding errors. So we will make certain it is + * so. + */ + if (A < 1) A = 1; + + if (A < A_crossover) { + /* + * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y). + * rx = log1p(Am1 + sqrt(Am1*(A+1))) + */ + if (y == 1 && x < DBL_EPSILON * DBL_EPSILON / 128) { + /* + * fp is of order x^2, and fm = x/2. + * A = 1 (inexactly). + */ + *rx = sqrt(x); + } else if (x >= DBL_EPSILON * fabs(y - 1)) { + /* + * Underflow will not occur because + * x >= DBL_EPSILON^2/128 >= FOUR_SQRT_MIN + */ + Am1 = f(x, 1 + y, R) + f(x, 1 - y, S); + *rx = log1p(Am1 + sqrt(Am1 * (A + 1))); + } else if (y < 1) { + /* + * fp = x*x/(1+y)/4, fm = x*x/(1-y)/4, and + * A = 1 (inexactly). + */ + *rx = x / sqrt((1 - y) * (1 + y)); + } else { /* if (y > 1) */ + /* + * A-1 = y-1 (inexactly). + */ + *rx = log1p((y - 1) + sqrt((y - 1) * (y + 1))); + } + } else { + *rx = log(A + sqrt(A * A - 1)); + } + + *new_y = y; + + if (y < FOUR_SQRT_MIN) { + /* + * Avoid a possible underflow caused by y/A. For casinh this + * would be legitimate, but will be picked up by invoking atan2 + * later on. For cacos this would not be legitimate. + */ + *B_is_usable = 0; + *sqrt_A2my2 = A * (2 / DBL_EPSILON); + *new_y = y * (2 / DBL_EPSILON); + return; + } + + /* B = (|z+I| - |z-I|) / 2 = y/A */ + *B = y / A; + *B_is_usable = 1; + + if (*B > B_crossover) { + *B_is_usable = 0; + /* + * Amy = fp + fm, where fp = f(x, y+1), and fm = f(x, y-1). + * sqrt_A2my2 = sqrt(Amy*(A+y)) + */ + if (y == 1 && x < DBL_EPSILON / 128) { + /* + * fp is of order x^2, and fm = x/2. + * A = 1 (inexactly). + */ + *sqrt_A2my2 = sqrt(x) * sqrt((A + y) / 2); + } else if (x >= DBL_EPSILON * fabs(y - 1)) { + /* + * Underflow will not occur because + * x >= DBL_EPSILON/128 >= FOUR_SQRT_MIN + * and + * x >= DBL_EPSILON^2 >= FOUR_SQRT_MIN + */ + Amy = f(x, y + 1, R) + f(x, y - 1, S); + *sqrt_A2my2 = sqrt(Amy * (A + y)); + } else if (y > 1) { + /* + * fp = x*x/(y+1)/4, fm = x*x/(y-1)/4, and + * A = y (inexactly). + * + * y < RECIP_EPSILON. So the following + * scaling should avoid any underflow problems. + */ + *sqrt_A2my2 = + x * (4 / DBL_EPSILON / DBL_EPSILON) * y / sqrt((y + 1) * (y - 1)); + *new_y = y * (4 / DBL_EPSILON / DBL_EPSILON); + } else { /* if (y < 1) */ + /* + * fm = 1-y >= DBL_EPSILON, fp is of order x^2, and + * A = 1 (inexactly). 
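+ * Since A = 1, sqrt(A*A - y*y) reduces to sqrt((1 - y) * (1 + y)).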
+ */ + *sqrt_A2my2 = sqrt((1 - y) * (1 + y)); + } + } +} + +/* + * casinh(z) = z + O(z^3) as z -> 0 + * + * casinh(z) = sign(x)*clog(sign(x)*z) + O(1/z^2) as z -> infinity + * The above formula works for the imaginary part as well, because + * Im(casinh(z)) = sign(x)*atan2(sign(x)*y, fabs(x)) + O(y/z^3) + * as z -> infinity, uniformly in y + */ +__host__ __device__ inline complex casinh(complex z) { + double x, y, ax, ay, rx, ry, B, sqrt_A2my2, new_y; + int B_is_usable; + complex w; + const double RECIP_EPSILON = 1.0 / DBL_EPSILON; + const double m_ln2 = 6.9314718055994531e-1; /* 0x162e42fefa39ef.0p-53 */ + x = z.real(); + y = z.imag(); + ax = fabs(x); + ay = fabs(y); + + if (isnan(x) || isnan(y)) { + /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */ + if (isinf(x)) return (complex(x, y + y)); + /* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */ + if (isinf(y)) return (complex(y, x + x)); + /* casinh(NaN + I*0) = NaN + I*0 */ + if (y == 0) return (complex(x + x, y)); + /* + * All other cases involving NaN return NaN + I*NaN. + * C99 leaves it optional whether to raise invalid if one of + * the arguments is not NaN, so we opt not to raise it. + */ + return (complex(x + 0.0 + (y + 0.0), x + 0.0 + (y + 0.0))); + } + + if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) { + /* clog...() will raise inexact unless x or y is infinite. */ + if (signbit(x) == 0) + w = clog_for_large_values(z) + m_ln2; + else + w = clog_for_large_values(-z) + m_ln2; + return (complex(copysign(w.real(), x), copysign(w.imag(), y))); + } + + /* Avoid spuriously raising inexact for z = 0. */ + if (x == 0 && y == 0) return (z); + + /* All remaining cases are inexact. */ + raise_inexact(); + + const double SQRT_6_EPSILON = + 3.6500241499888571e-8; /* 0x13988e1409212e.0p-77 */ + if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) return (z); + + do_hard_work(ax, ay, &rx, &B_is_usable, &B, &sqrt_A2my2, &new_y); + if (B_is_usable) + ry = asin(B); + else + ry = atan2(new_y, sqrt_A2my2); + return (complex(copysign(rx, x), copysign(ry, y))); +} + +/* + * casin(z) = reverse(casinh(reverse(z))) + * where reverse(x + I*y) = y + I*x = I*conj(z). + */ +__host__ __device__ inline complex casin(complex z) { + complex w = casinh(complex(z.imag(), z.real())); + + return (complex(w.imag(), w.real())); +} + +/* + * cacos(z) = PI/2 - casin(z) + * but do the computation carefully so cacos(z) is accurate when z is + * close to 1. 
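+ * (Computing PI/2 - casin(z) literally would cancel catastrophically,
+ * because casin(z) approaches PI/2 as z approaches 1.)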
+ * + * cacos(z) = PI/2 - z + O(z^3) as z -> 0 + * + * cacos(z) = -sign(y)*I*clog(z) + O(1/z^2) as z -> infinity + * The above formula works for the real part as well, because + * Re(cacos(z)) = atan2(fabs(y), x) + O(y/z^3) + * as z -> infinity, uniformly in y + */ +__host__ __device__ inline complex cacos(complex z) { + double x, y, ax, ay, rx, ry, B, sqrt_A2mx2, new_x; + int sx, sy; + int B_is_usable; + complex w; + const double pio2_hi = 1.5707963267948966e0; /* 0x1921fb54442d18.0p-52 */ + const volatile double pio2_lo = + 6.1232339957367659e-17; /* 0x11a62633145c07.0p-106 */ + const double m_ln2 = 6.9314718055994531e-1; /* 0x162e42fefa39ef.0p-53 */ + + x = z.real(); + y = z.imag(); + sx = signbit(x); + sy = signbit(y); + ax = fabs(x); + ay = fabs(y); + + if (isnan(x) || isnan(y)) { + /* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ + if (isinf(x)) return (complex(y + y, -infinity())); + /* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */ + if (isinf(y)) return (complex(x + x, -y)); + /* cacos(0 + I*NaN) = PI/2 + I*NaN with inexact */ + if (x == 0) return (complex(pio2_hi + pio2_lo, y + y)); + /* + * All other cases involving NaN return NaN + I*NaN. + * C99 leaves it optional whether to raise invalid if one of + * the arguments is not NaN, so we opt not to raise it. + */ + return (complex(x + 0.0 + (y + 0), x + 0.0 + (y + 0))); + } + + const double RECIP_EPSILON = 1.0 / DBL_EPSILON; + if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) { + /* clog...() will raise inexact unless x or y is infinite. */ + w = clog_for_large_values(z); + rx = fabs(w.imag()); + ry = w.real() + m_ln2; + if (sy == 0) ry = -ry; + return (complex(rx, ry)); + } + + /* Avoid spuriously raising inexact for z = 1. */ + if (x == 1.0 && y == 0.0) return (complex(0, -y)); + + /* All remaining cases are inexact. */ + raise_inexact(); + + const double SQRT_6_EPSILON = + 3.6500241499888571e-8; /* 0x13988e1409212e.0p-77 */ + if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) + return (complex(pio2_hi - (x - pio2_lo), -y)); + + do_hard_work(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x); + if (B_is_usable) { + if (sx == 0) + rx = acos(B); + else + rx = acos(-B); + } else { + if (sx == 0) + rx = atan2(sqrt_A2mx2, new_x); + else + rx = atan2(sqrt_A2mx2, -new_x); + } + if (sy == 0) ry = -ry; + return (complex(rx, ry)); +} + +/* + * cacosh(z) = I*cacos(z) or -I*cacos(z) + * where the sign is chosen so Re(cacosh(z)) >= 0. + */ +__host__ __device__ inline complex cacosh(complex z) { + complex w; + double rx, ry; + + w = cacos(z); + rx = w.real(); + ry = w.imag(); + /* cacosh(NaN + I*NaN) = NaN + I*NaN */ + if (isnan(rx) && isnan(ry)) return (complex(ry, rx)); + /* cacosh(NaN + I*+-Inf) = +Inf + I*NaN */ + /* cacosh(+-Inf + I*NaN) = +Inf + I*NaN */ + if (isnan(rx)) return (complex(fabs(ry), rx)); + /* cacosh(0 + I*NaN) = NaN + I*NaN */ + if (isnan(ry)) return (complex(ry, ry)); + return (complex(fabs(ry), copysign(rx, z.imag()))); +} + +/* + * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON. + */ +__host__ __device__ inline complex clog_for_large_values(complex z) { + double x, y; + double ax, ay, t; + const double m_e = 2.7182818284590452e0; /* 0x15bf0a8b145769.0p-51 */ + + x = z.real(); + y = z.imag(); + ax = fabs(x); + ay = fabs(y); + if (ax < ay) { + t = ax; + ax = ay; + ay = t; + } + + /* + * Avoid overflow in hypot() when x and y are both very large. + * Divide x and y by E, and then add 1 to the logarithm. This depends + * on E being larger than sqrt(2). 
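+ * (hypot scales linearly, so log(hypot(x, y)) = log(hypot(x/e, y/e)) + 1,
+ * using log(e) = 1.)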
+ * Dividing by E causes an insignificant loss of accuracy; however
+ * this method is still poor since it is unnecessarily slow.
+ */
+  if (ax > DBL_MAX / 2)
+    return (complex<double>(log(hypot(x / m_e, y / m_e)) + 1, atan2(y, x)));
+
+  /*
+   * Avoid overflow when x or y is large. Avoid underflow when x or
+   * y is small.
+   */
+  const double QUARTER_SQRT_MAX =
+      5.966672584960165394632772e-154; /* = 0x1p509; <= sqrt(DBL_MAX) / 4 */
+  const double SQRT_MIN =
+      1.491668146240041348658193e-154; /* = 0x1p-511; >= sqrt(DBL_MIN) */
+  if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
+    return (complex<double>(log(hypot(x, y)), atan2(y, x)));
+
+  return (complex<double>(log(ax * ax + ay * ay) / 2, atan2(y, x)));
+}
+
+/*
+ * =================
+ * | catanh, catan |
+ * =================
+ */
+
+/*
+ * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
+ * Assumes x*x and y*y will not overflow.
+ * Assumes x and y are finite.
+ * Assumes y is non-negative.
+ * Assumes fabs(x) >= DBL_EPSILON.
+ */
+__host__ __device__ inline double sum_squares(double x, double y) {
+  const double SQRT_MIN =
+      1.491668146240041348658193e-154; /* = 0x1p-511; >= sqrt(DBL_MIN) */
+  /* Avoid underflow when y is small. */
+  if (y < SQRT_MIN) return (x * x);
+
+  return (x * x + y * y);
+}
+
+/*
+ * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
+ * Assumes x and y are not NaN, and one of x and y is larger than
+ * RECIP_EPSILON. We avoid unwarranted underflow. It is important to not use
+ * the code creal(1/z), because the imaginary part may produce an unwanted
+ * underflow.
+ * This is only called in a context where inexact is always raised before
+ * the call, so no effort is made to avoid or force inexact.
+ */
+__host__ __device__ inline double real_part_reciprocal(double x, double y) {
+  double scale;
+  uint32_t hx, hy;
+  int32_t ix, iy;
+
+  /*
+   * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
+   * example 2.
+   */
+  get_high_word(hx, x);
+  ix = hx & 0x7ff00000;
+  get_high_word(hy, y);
+  iy = hy & 0x7ff00000;
+  //#define BIAS (DBL_MAX_EXP - 1)
+  const int BIAS = DBL_MAX_EXP - 1;
+  /* XXX more guard digits are useful iff there is extra precision. */
+  //#define CUTOFF (DBL_MANT_DIG / 2 + 1) /* just half or 1 guard digit */
+  const int CUTOFF = (DBL_MANT_DIG / 2 + 1);
+  if (ix - iy >= CUTOFF << 20 || isinf(x))
+    return (1 / x); /* +-Inf -> +-0 is special */
+  if (iy - ix >= CUTOFF << 20)
+    return (x / y / y); /* should avoid double div, but hard */
+  if (ix <= (BIAS + DBL_MAX_EXP / 2 - CUTOFF) << 20)
+    return (x / (x * x + y * y));
+  scale = 1;
+  set_high_word(scale, 0x7ff00000 - ix); /* 2**(1-ilogb(x)) */
+  x *= scale;
+  y *= scale;
+  return (x / (x * x + y * y) * scale);
+}
+
+/*
+ * catanh(z) = log((1+z)/(1-z)) / 2
+ *           = log1p(4*x / |z-1|^2) / 4
+ *             + I * atan2(2*y, (1-x)*(1+x)-y*y) / 2
+ *
+ * catanh(z) = z + O(z^3) as z -> 0
+ *
+ * catanh(z) = 1/z + sign(y)*I*PI/2 + O(1/z^3) as z -> infinity
+ * The above formula works for the real part as well, because
+ * Re(catanh(z)) = x/|z|^2 + O(x/z^4)
+ * as z -> infinity, uniformly in x
+ */
+#if __cplusplus >= 201103L || !defined _MSC_VER
+__host__ __device__ inline complex<double> catanh(complex<double> z) {
+  double x, y, ax, ay, rx, ry;
+  const volatile double pio2_lo =
+      6.1232339957367659e-17; /* 0x11a62633145c07.0p-106 */
+  const double pio2_hi = 1.5707963267948966e0; /* 0x1921fb54442d18.0p-52 */
+
+  x = z.real();
+  y = z.imag();
+  ax = fabs(x);
+  ay = fabs(y);
+
+  /* This helps handle many cases.
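+   * On the real segment |x| <= 1, catanh agrees with the real atanh:
+   * e.g. catanh(0.5 + 0*I) = atanh(0.5) + 0*I ~= 0.5493 + 0*I.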
+   */
+  if (y == 0 && ax <= 1) return (complex<double>(atanh(x), y));
+
+  /* To ensure the same accuracy as atan(), and to filter out z = 0. */
+  if (x == 0) return (complex<double>(x, atan(y)));
+
+  if (isnan(x) || isnan(y)) {
+    /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
+    if (isinf(x)) return (complex<double>(copysign(0.0, x), y + y));
+    /* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
+    if (isinf(y))
+      return (
+          complex<double>(copysign(0.0, x), copysign(pio2_hi + pio2_lo, y)));
+    /*
+     * All other cases involving NaN return NaN + I*NaN.
+     * C99 leaves it optional whether to raise invalid if one of
+     * the arguments is not NaN, so we opt not to raise it.
+     */
+    return (complex<double>(x + 0.0 + (y + 0), x + 0.0 + (y + 0)));
+  }
+
+  const double RECIP_EPSILON = 1.0 / DBL_EPSILON;
+  if (ax > RECIP_EPSILON || ay > RECIP_EPSILON)
+    return (complex<double>(real_part_reciprocal(x, y),
+                            copysign(pio2_hi + pio2_lo, y)));
+
+  const double SQRT_3_EPSILON =
+      2.5809568279517849e-8; /* 0x1bb67ae8584caa.0p-78 */
+  if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
+    /*
+     * z = 0 was filtered out above. All other cases must raise
+     * inexact, but this is the only one that needs to do it
+     * explicitly.
+     */
+    raise_inexact();
+    return (z);
+  }
+
+  const double m_ln2 = 6.9314718055994531e-1; /* 0x162e42fefa39ef.0p-53 */
+  if (ax == 1 && ay < DBL_EPSILON)
+    rx = (m_ln2 - log(ay)) / 2;
+  else
+    rx = log1p(4 * ax / sum_squares(ax - 1, ay)) / 4;
+
+  if (ax == 1)
+    ry = atan2(2.0, -ay) / 2;
+  else if (ay < DBL_EPSILON)
+    ry = atan2(2 * ay, (1 - ax) * (1 + ax)) / 2;
+  else
+    ry = atan2(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
+
+  return (complex<double>(copysign(rx, x), copysign(ry, y)));
+}
+
+/*
+ * catan(z) = reverse(catanh(reverse(z)))
+ * where reverse(x + I*y) = y + I*x = I*conj(z).
+ */
+__host__ __device__ inline complex<double> catan(complex<double> z) {
+  complex<double> w = catanh(complex<double>(z.imag(), z.real()));
+  return (complex<double>(w.imag(), w.real()));
+}
+
+#endif
+
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> acos(const complex<ValueType>& z) {
+  const complex<ValueType> ret = thrust::asin(z);
+  const ValueType pi = ValueType(3.14159265358979323846);
+  return complex<ValueType>(pi / 2 - ret.real(), -ret.imag());
+}
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> asin(const complex<ValueType>& z) {
+  const complex<ValueType> i(0, 1);
+  return -i * asinh(i * z);
+}
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> atan(const complex<ValueType>& z) {
+  const complex<ValueType> i(0, 1);
+  return -i * thrust::atanh(i * z);
+}
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> acosh(const complex<ValueType>& z) {
+  thrust::complex<ValueType> ret(
+      (z.real() - z.imag()) * (z.real() + z.imag()) - ValueType(1.0),
+      ValueType(2.0) * z.real() * z.imag());
+  ret = thrust::sqrt(ret);
+  if (z.real() < ValueType(0.0)) {
+    ret = -ret;
+  }
+  ret += z;
+  ret = thrust::log(ret);
+  if (ret.real() < ValueType(0.0)) {
+    ret = -ret;
+  }
+  return ret;
+}
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> asinh(const complex<ValueType>& z) {
+  return thrust::log(thrust::sqrt(z * z + ValueType(1)) + z);
+}
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> atanh(const complex<ValueType>& z) {
+  ValueType imag2 = z.imag() * z.imag();
+  ValueType n = ValueType(1.0) + z.real();
+  n = imag2 + n * n;
+
+  ValueType d = ValueType(1.0) - z.real();
+  d = imag2 + d * d;
+  complex<ValueType> ret(ValueType(0.25) * (::log(n) - ::log(d)), 0);
+
+  d = ValueType(1.0) - z.real() * z.real() - imag2;
+
+  ret.imag(ValueType(0.5) * ::atan2(ValueType(2.0) * z.imag(), d));
+  return ret;
+}
+
+template <>
+__host__ __device__ inline complex<double> acos(const complex<double>& z) {
+  return detail::complex::cacos(z);
+} + +template <> +__host__ __device__ inline complex asin(const complex& z) { + return detail::complex::casin(z); +} + +#if __cplusplus >= 201103L || !defined _MSC_VER +template <> +__host__ __device__ inline complex atan(const complex& z) { + return detail::complex::catan(z); +} +#endif + +template <> +__host__ __device__ inline complex acosh(const complex& z) { + return detail::complex::cacosh(z); +} + +template <> +__host__ __device__ inline complex asinh(const complex& z) { + return detail::complex::casinh(z); +} + +#if __cplusplus >= 201103L || !defined _MSC_VER +template <> +__host__ __device__ inline complex atanh(const complex& z) { + return detail::complex::catanh(z); +} +#endif + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/catrigf.h b/cupy/_core/include/cupy/complex/catrigf.h new file mode 100644 index 0000000..410d06f --- /dev/null +++ b/cupy/_core/include/cupy/complex/catrigf.h @@ -0,0 +1,444 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2012 Stephen Montgomery-Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Adapted from FreeBSD by Filipe Maia : + * freebsd/lib/msun/src/catrig.c + */ + + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +using thrust::complex; + +__host__ __device__ inline complex clog_for_large_values(complex z); + +/* + * The algorithm is very close to that in "Implementing the complex arcsine + * and arccosine functions using exception handling" by T. E. Hull, Thomas F. 
+ * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on + * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335, + * http://dl.acm.org/citation.cfm?id=275324. + * + * See catrig.c for complete comments. + * + * XXX comments were removed automatically, and even short ones on the right + * of statements were removed (all of them), contrary to normal style. Only + * a few comments on the right of declarations remain. + */ + +__host__ __device__ inline float f(float a, float b, float hypot_a_b) { + if (b < 0.0f) return ((hypot_a_b - b) / 2.0f); + if (b == 0.0f) return (a / 2.0f); + return (a * a / (hypot_a_b + b) / 2.0f); +} + +/* + * All the hard work is contained in this function. + * x and y are assumed positive or zero, and less than RECIP_EPSILON. + * Upon return: + * rx = Re(casinh(z)) = -Im(cacos(y + I*x)). + * B_is_usable is set to 1 if the value of B is usable. + * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y. + * If returning sqrt_A2my2 has potential to result in an underflow, it is + * rescaled, and new_y is similarly rescaled. + */ +__host__ __device__ inline void do_hard_work(float x, float y, float* rx, + int* B_is_usable, float* B, + float* sqrt_A2my2, float* new_y) { + float R, S, A; /* A, B, R, and S are as in Hull et al. */ + float Am1, Amy; /* A-1, A-y. */ + const float A_crossover = + 10; /* Hull et al suggest 1.5, but 10 works better */ + const float FOUR_SQRT_MIN = 4.336808689942017736029811e-19f; + ; /* =0x1p-61; >= 4 * sqrt(FLT_MIN) */ + const float B_crossover = 0.6417f; /* suggested by Hull et al */ + R = hypotf(x, y + 1); + S = hypotf(x, y - 1); + + A = (R + S) / 2; + if (A < 1) A = 1; + + if (A < A_crossover) { + if (y == 1 && x < FLT_EPSILON * FLT_EPSILON / 128) { + *rx = sqrtf(x); + } else if (x >= FLT_EPSILON * fabsf(y - 1)) { + Am1 = f(x, 1 + y, R) + f(x, 1 - y, S); + *rx = log1pf(Am1 + sqrtf(Am1 * (A + 1))); + } else if (y < 1) { + *rx = x / sqrtf((1 - y) * (1 + y)); + } else { + *rx = log1pf((y - 1) + sqrtf((y - 1) * (y + 1))); + } + } else { + *rx = logf(A + sqrtf(A * A - 1)); + } + + *new_y = y; + + if (y < FOUR_SQRT_MIN) { + *B_is_usable = 0; + *sqrt_A2my2 = A * (2 / FLT_EPSILON); + *new_y = y * (2 / FLT_EPSILON); + return; + } + + *B = y / A; + *B_is_usable = 1; + + if (*B > B_crossover) { + *B_is_usable = 0; + if (y == 1 && x < FLT_EPSILON / 128) { + *sqrt_A2my2 = sqrtf(x) * sqrtf((A + y) / 2); + } else if (x >= FLT_EPSILON * fabsf(y - 1)) { + Amy = f(x, y + 1, R) + f(x, y - 1, S); + *sqrt_A2my2 = sqrtf(Amy * (A + y)); + } else if (y > 1) { + *sqrt_A2my2 = + x * (4 / FLT_EPSILON / FLT_EPSILON) * y / sqrtf((y + 1) * (y - 1)); + *new_y = y * (4 / FLT_EPSILON / FLT_EPSILON); + } else { + *sqrt_A2my2 = sqrtf((1 - y) * (1 + y)); + } + } +} + +__host__ __device__ inline complex casinhf(complex z) { + float x, y, ax, ay, rx, ry, B, sqrt_A2my2, new_y; + int B_is_usable; + complex w; + const float RECIP_EPSILON = 1.0 / FLT_EPSILON; + const float m_ln2 = 6.9314718055994531e-1f; /* 0x162e42fefa39ef.0p-53 */ + x = z.real(); + y = z.imag(); + ax = fabsf(x); + ay = fabsf(y); + + if (isnan(x) || isnan(y)) { + if (isinf(x)) return (complex(x, y + y)); + if (isinf(y)) return (complex(y, x + x)); + if (y == 0) return (complex(x + x, y)); + return (complex(x + 0.0f + (y + 0), x + 0.0f + (y + 0))); + } + + if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) { + if (signbit(x) == 0) + w = clog_for_large_values(z) + m_ln2; + else + w = clog_for_large_values(-z) + m_ln2; + return (complex(copysignf(w.real(), x), 
copysignf(w.imag(), y))); + } + + if (x == 0 && y == 0) return (z); + + raise_inexact(); + + const float SQRT_6_EPSILON = 8.4572793338e-4f; /* 0xddb3d7.0p-34 */ + if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) return (z); + + do_hard_work(ax, ay, &rx, &B_is_usable, &B, &sqrt_A2my2, &new_y); + if (B_is_usable) + ry = asinf(B); + else + ry = atan2f(new_y, sqrt_A2my2); + return (complex(copysignf(rx, x), copysignf(ry, y))); +} + +__host__ __device__ inline complex casinf(complex z) { + complex w = casinhf(complex(z.imag(), z.real())); + + return (complex(w.imag(), w.real())); +} + +__host__ __device__ inline complex cacosf(complex z) { + float x, y, ax, ay, rx, ry, B, sqrt_A2mx2, new_x; + int sx, sy; + int B_is_usable; + complex w; + const float pio2_hi = 1.5707963267948966e0f; /* 0x1921fb54442d18.0p-52 */ + const volatile float pio2_lo = + 6.1232339957367659e-17f; /* 0x11a62633145c07.0p-106 */ + const float m_ln2 = 6.9314718055994531e-1f; /* 0x162e42fefa39ef.0p-53 */ + + x = z.real(); + y = z.imag(); + sx = signbit(x); + sy = signbit(y); + ax = fabsf(x); + ay = fabsf(y); + + if (isnan(x) || isnan(y)) { + if (isinf(x)) return (complex(y + y, -infinity())); + if (isinf(y)) return (complex(x + x, -y)); + if (x == 0) return (complex(pio2_hi + pio2_lo, y + y)); + return (complex(x + 0.0f + (y + 0), x + 0.0f + (y + 0))); + } + + const float RECIP_EPSILON = 1.0 / FLT_EPSILON; + if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) { + w = clog_for_large_values(z); + rx = fabsf(w.imag()); + ry = w.real() + m_ln2; + if (sy == 0) ry = -ry; + return (complex(rx, ry)); + } + + if (x == 1 && y == 0) return (complex(0, -y)); + + raise_inexact(); + + const float SQRT_6_EPSILON = 8.4572793338e-4f; /* 0xddb3d7.0p-34 */ + if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) + return (complex(pio2_hi - (x - pio2_lo), -y)); + + do_hard_work(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x); + if (B_is_usable) { + if (sx == 0) + rx = acosf(B); + else + rx = acosf(-B); + } else { + if (sx == 0) + rx = atan2f(sqrt_A2mx2, new_x); + else + rx = atan2f(sqrt_A2mx2, -new_x); + } + if (sy == 0) ry = -ry; + return (complex(rx, ry)); +} + +__host__ __device__ inline complex cacoshf(complex z) { + complex w; + float rx, ry; + + w = cacosf(z); + rx = w.real(); + ry = w.imag(); + /* cacosh(NaN + I*NaN) = NaN + I*NaN */ + if (isnan(rx) && isnan(ry)) return (complex(ry, rx)); + /* cacosh(NaN + I*+-Inf) = +Inf + I*NaN */ + /* cacosh(+-Inf + I*NaN) = +Inf + I*NaN */ + if (isnan(rx)) return (complex(fabsf(ry), rx)); + /* cacosh(0 + I*NaN) = NaN + I*NaN */ + if (isnan(ry)) return (complex(ry, ry)); + return (complex(fabsf(ry), copysignf(rx, z.imag()))); +} + +/* + * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON. 
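+ * Same scaling trick as the double-precision version in catrig.h: divide
+ * by e before hypotf() and add 1 to the logarithm.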
+ */ +__host__ __device__ inline complex clog_for_large_values(complex z) { + float x, y; + float ax, ay, t; + const float m_e = 2.7182818284590452e0f; /* 0x15bf0a8b145769.0p-51 */ + + x = z.real(); + y = z.imag(); + ax = fabsf(x); + ay = fabsf(y); + if (ax < ay) { + t = ax; + ax = ay; + ay = t; + } + + if (ax > FLT_MAX / 2) + return (complex(logf(hypotf(x / m_e, y / m_e)) + 1, atan2f(y, x))); + + const float QUARTER_SQRT_MAX = + 2.3058430092136939520000000e+18f; /* = 0x1p61; <= sqrt(FLT_MAX) / 4 */ + const float SQRT_MIN = + 1.084202172485504434007453e-19f; /* 0x1p-63; >= sqrt(FLT_MIN) */ + if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN) + return (complex(logf(hypotf(x, y)), atan2f(y, x))); + + return (complex(logf(ax * ax + ay * ay) / 2, atan2f(y, x))); +} + +/* + * ================= + * | catanh, catan | + * ================= + */ + +/* + * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow). + * Assumes x*x and y*y will not overflow. + * Assumes x and y are finite. + * Assumes y is non-negative. + * Assumes fabsf(x) >= FLT_EPSILON. + */ +__host__ __device__ inline float sum_squares(float x, float y) { + const float SQRT_MIN = + 1.084202172485504434007453e-19f; /* 0x1p-63; >= sqrt(FLT_MIN) */ + /* Avoid underflow when y is small. */ + if (y < SQRT_MIN) return (x * x); + + return (x * x + y * y); +} + +__host__ __device__ inline float real_part_reciprocal(float x, float y) { + float scale; + uint32_t hx, hy; + int32_t ix, iy; + + get_float_word(hx, x); + ix = hx & 0x7f800000; + get_float_word(hy, y); + iy = hy & 0x7f800000; + //#define BIAS (FLT_MAX_EXP - 1) + const int BIAS = FLT_MAX_EXP - 1; + //#define CUTOFF (FLT_MANT_DIG / 2 + 1) + const int CUTOFF = (FLT_MANT_DIG / 2 + 1); + if (ix - iy >= CUTOFF << 23 || isinf(x)) return (1 / x); + if (iy - ix >= CUTOFF << 23) return (x / y / y); + if (ix <= (BIAS + FLT_MAX_EXP / 2 - CUTOFF) << 23) + return (x / (x * x + y * y)); + set_float_word(scale, 0x7f800000 - ix); + x *= scale; + y *= scale; + return (x / (x * x + y * y) * scale); +} + +#if __cplusplus >= 201103L || !defined _MSC_VER +__host__ __device__ inline complex catanhf(complex z) { + float x, y, ax, ay, rx, ry; + const volatile float pio2_lo = + 6.1232339957367659e-17; /* 0x11a62633145c07.0p-106 */ + const float pio2_hi = 1.5707963267948966e0; /* 0x1921fb54442d18.0p-52 */ + + x = z.real(); + y = z.imag(); + ax = fabsf(x); + ay = fabsf(y); + + if (y == 0 && ax <= 1) return (complex(atanhf(x), y)); + + if (x == 0) return (complex(x, atanf(y))); + + if (isnan(x) || isnan(y)) { + if (isinf(x)) return (complex(copysignf(0, x), y + y)); + if (isinf(y)) + return (complex(copysignf(0, x), copysignf(pio2_hi + pio2_lo, y))); + return (complex(x + 0.0f + (y + 0.0f), x + 0.0f + (y + 0.0f))); + } + + const float RECIP_EPSILON = 1.0f / FLT_EPSILON; + if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) + return (complex(real_part_reciprocal(x, y), + copysignf(pio2_hi + pio2_lo, y))); + + const float SQRT_3_EPSILON = 5.9801995673e-4; /* 0x9cc471.0p-34 */ + if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) { + raise_inexact(); + return (z); + } + + const float m_ln2 = 6.9314718056e-1f; /* 0xb17218.0p-24 */ + if (ax == 1 && ay < FLT_EPSILON) + rx = (m_ln2 - logf(ay)) / 2; + else + rx = log1pf(4 * ax / sum_squares(ax - 1, ay)) / 4; + + if (ax == 1) + ry = atan2f(2, -ay) / 2; + else if (ay < FLT_EPSILON) + ry = atan2f(2 * ay, (1 - ax) * (1 + ax)) / 2; + else + ry = atan2f(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2; + + return (complex(copysignf(rx, x), copysignf(ry, y))); +} + +__host__ 
__device__ inline complex catanf(complex z) { + complex w = catanhf(complex(z.imag(), z.real())); + return (complex(w.imag(), w.real())); +} +#endif + +} // namespace complex + +} // namespace detail + +template <> +__host__ __device__ inline complex acos(const complex& z) { + return detail::complex::cacosf(z); +} + +template <> +__host__ __device__ inline complex asin(const complex& z) { + return detail::complex::casinf(z); +} + +#if __cplusplus >= 201103L || !defined _MSC_VER +template <> +__host__ __device__ inline complex atan(const complex& z) { + return detail::complex::catanf(z); +} +#endif + +template <> +__host__ __device__ inline complex acosh(const complex& z) { + return detail::complex::cacoshf(z); +} + +template <> +__host__ __device__ inline complex asinh(const complex& z) { + return detail::complex::casinhf(z); +} + +#if __cplusplus >= 201103L || !defined _MSC_VER +template <> +__host__ __device__ inline complex atanh(const complex& z) { + return detail::complex::catanhf(z); +} +#endif + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/ccosh.h b/cupy/_core/include/cupy/complex/ccosh.h new file mode 100644 index 0000000..93af077 --- /dev/null +++ b/cupy/_core/include/cupy/complex/ccosh.h @@ -0,0 +1,205 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* adapted from FreeBSD: + * lib/msun/src/s_ccosh.c + */ + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +/* + * Hyperbolic cosine of a complex argument z = x + i y. 
+ * + * cosh(z) = cosh(x+iy) + * = cosh(x) cos(y) + i sinh(x) sin(y). + * + * Exceptional values are noted in the comments within the source code. + * These values and the return value were taken from n1124.pdf. + */ + +__host__ __device__ inline thrust::complex ccosh( + const thrust::complex& z) { + const double huge = 8.98846567431157953864652595395e+307; // 0x1p1023 + double x, y, h; + uint32_t hx, hy, ix, iy, lx, ly; + + x = z.real(); + y = z.imag(); + + extract_words(hx, lx, x); + extract_words(hy, ly, y); + + ix = 0x7fffffff & hx; + iy = 0x7fffffff & hy; + + /* Handle the nearly-non-exceptional cases where x and y are finite. */ + if (ix < 0x7ff00000 && iy < 0x7ff00000) { + if ((iy | ly) == 0) return (thrust::complex(::cosh(x), x * y)); + if (ix < 0x40360000) /* small x: normal case */ + return ( + thrust::complex(::cosh(x) * ::cos(y), ::sinh(x) * ::sin(y))); + + /* |x| >= 22, so cosh(x) ~= exp(|x|) */ + if (ix < 0x40862e42) { + /* x < 710: exp(|x|) won't overflow */ + h = ::exp(::fabs(x)) * 0.5; + return (thrust::complex(h * cos(y), copysign(h, x) * sin(y))); + } else if (ix < 0x4096bbaa) { + /* x < 1455: scale to avoid overflow */ + thrust::complex z_; + z_ = ldexp_cexp(thrust::complex(fabs(x), y), -1); + return (thrust::complex(z_.real(), z_.imag() * copysign(1.0, x))); + } else { + /* x >= 1455: the result always overflows */ + h = huge * x; + return (thrust::complex(h * h * cos(y), h * sin(y))); + } + } + + /* + * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. Raise the invalid floating-point exception. + * + * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if ((ix | lx) == 0 && iy >= 0x7ff00000) + return (thrust::complex(y - y, copysign(0.0, x * (y - y)))); + + /* + * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0. + * + * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0. + * The sign of 0 in the result is unspecified. + */ + if ((iy | ly) == 0 && ix >= 0x7ff00000) { + if (((hx & 0xfffff) | lx) == 0) + return (thrust::complex(x * x, copysign(0.0, x) * y)); + return (thrust::complex(x * x, copysign(0.0, (x + x) * y))); + } + + /* + * cosh(x +- I Inf) = dNaN + I dNaN. + * Raise the invalid floating-point exception for finite nonzero x. + * + * cosh(x + I NaN) = d(NaN) + I d(NaN). + * Optionally raises the invalid floating-point exception for finite + * nonzero x. Choice = don't raise (except for signaling NaNs). + */ + if (ix < 0x7ff00000 && iy >= 0x7ff00000) + return (thrust::complex(y - y, x * (y - y))); + + /* + * cosh(+-Inf + I NaN) = +Inf + I d(NaN). + * + * cosh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * Raise the invalid floating-point exception. + * + * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y) + */ + if (ix >= 0x7ff00000 && ((hx & 0xfffff) | lx) == 0) { + if (iy >= 0x7ff00000) return (thrust::complex(x * x, x * (y - y))); + return (thrust::complex((x * x) * cos(y), x * sin(y))); + } + + /* + * cosh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * cosh(NaN +- I Inf) = d(NaN) + I d(NaN). + * Optionally raises the invalid floating-point exception. + * Choice = raise. + * + * cosh(NaN + I y) = d(NaN) + I d(NaN). + * Optionally raises the invalid floating-point exception for finite + * nonzero y. Choice = don't raise (except for signaling NaNs). 
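+ *
+ * A note on the idiom in the fall-through return below (hedged commentary,
+ * not part of the FreeBSD sources): for IEEE doubles, x * x maps -Inf to
+ * +Inf and preserves NaN, while y - y is +0 for finite y and NaN for
+ * infinite or NaN y (raising the invalid exception for Inf), e.g.
+ *
+ *   double inf = INFINITY;
+ *   double a = inf - inf;        // NaN, raises FE_INVALID
+ *   double b = (-inf) * (-inf);  // +Inf
+ *
+ * so products of these terms produce exactly the d(NaN) mixtures listed
+ * above while feeding both inputs into each result component.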
+ */ + return (thrust::complex((x * x) * (y - y), (x + x) * (y - y))); +} + +__host__ __device__ inline thrust::complex ccos( + const thrust::complex& z) { + /* ccos(z) = ccosh(I * z) */ + return (ccosh(thrust::complex(-z.imag(), z.real()))); +} + +} // namespace complex + +} // namespace detail + +template +__host__ __device__ inline complex cos(const complex& z) { + const ValueType re = z.real(); + const ValueType im = z.imag(); + return complex(::cos(re) * ::cosh(im), -::sin(re) * ::sinh(im)); +} + +template +__host__ __device__ inline complex cosh(const complex& z) { + const ValueType re = z.real(); + const ValueType im = z.imag(); + return complex(::cosh(re) * ::cos(im), ::sinh(re) * ::sin(im)); +} + +template <> +__host__ __device__ inline thrust::complex cos( + const thrust::complex& z) { + return detail::complex::ccos(z); +} + +template <> +__host__ __device__ inline thrust::complex cosh( + const thrust::complex& z) { + return detail::complex::ccosh(z); +} + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/ccoshf.h b/cupy/_core/include/cupy/complex/ccoshf.h new file mode 100644 index 0000000..744abab --- /dev/null +++ b/cupy/_core/include/cupy/complex/ccoshf.h @@ -0,0 +1,135 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* adapted from FreeBSD: + * lib/msun/src/s_ccoshf.c + */ + + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +using thrust::complex; + +__host__ __device__ inline complex ccoshf(const complex& z) { + float x, y, h; + uint32_t hx, hy, ix, iy; + const float huge = 1.70141183460469231731687303716e+38; // 0x1p127; + + x = z.real(); + y = z.imag(); + + get_float_word(hx, x); + get_float_word(hy, y); + + ix = 0x7fffffff & hx; + iy = 0x7fffffff & hy; + if (ix < 0x7f800000 && iy < 0x7f800000) { + if (iy == 0) { + return (complex(coshf(x), x * y)); + } + if (ix < 0x41100000) { /* small x: normal case */ + return (complex(coshf(x) * cosf(y), sinhf(x) * sinf(y))); + } + /* |x| >= 9, so cosh(x) ~= exp(|x|) */ + if (ix < 0x42b17218) { + /* x < 88.7: expf(|x|) won't overflow */ + h = expf(fabsf(x)) * 0.5f; + return (complex(h * cosf(y), copysignf(h, x) * sinf(y))); + } else if (ix < 0x4340b1e7) { + /* x < 192.7: scale to avoid overflow */ + thrust::complex z_; + z_ = ldexp_cexpf(complex(fabsf(x), y), -1); + return (complex(z_.real(), z_.imag() * copysignf(1.0f, x))); + } else { + /* x >= 192.7: the result always overflows */ + h = huge * x; + return (complex(h * h * cosf(y), h * sinf(y))); + } + } + + if (ix == 0 && iy >= 0x7f800000) { + return (complex(y - y, copysignf(0.0f, x * (y - y)))); + } + if (iy == 0 && ix >= 0x7f800000) { + if ((hx & 0x7fffff) == 0) + return (complex(x * x, copysignf(0.0f, x) * y)); + return (complex(x * x, copysignf(0.0f, (x + x) * y))); + } + + if (ix < 0x7f800000 && iy >= 0x7f800000) { + return (complex(y - y, x * (y - y))); + } + + if (ix >= 0x7f800000 && (hx & 0x7fffff) == 0) { + if (iy >= 0x7f800000) return (complex(x * x, x * (y - y))); + return (complex((x * x) * cosf(y), x * sinf(y))); + } + return (complex((x * x) * (y - y), (x + x) * (y - y))); +} + +__host__ __device__ inline complex ccosf(const complex& z) { + return (ccoshf(complex(-z.imag(), z.real()))); +} + +} // namespace complex + +} // namespace detail + +template <> +__host__ __device__ inline complex cos(const complex& z) { + return detail::complex::ccosf(z); +} + +template <> +__host__ __device__ inline complex cosh(const complex& z) { + return detail::complex::ccoshf(z); +} + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/cexp.h b/cupy/_core/include/cupy/complex/cexp.h new file mode 100644 index 0000000..3bf1c79 --- /dev/null +++ b/cupy/_core/include/cupy/complex/cexp.h @@ -0,0 +1,173 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2011 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* adapted from FreeBSD: + * lib/msun/src/s_cexp.c + * lib/msun/src/k_exp.c + * + */ + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +/* + * Compute exp(x), scaled to avoid spurious overflow. An exponent is + * returned separately in 'expt'. + * + * Input: ln(DBL_MAX) <= x < ln(2 * DBL_MAX / DBL_MIN_DENORM) ~= 1454.91 + * Output: 2**1023 <= y < 2**1024 + */ +__host__ __device__ inline double frexp_exp(double x, int* expt) { + const uint32_t k = 1799; /* constant for reduction */ + const double kln2 = 1246.97177782734161156; /* k * ln2 */ + + double exp_x; + uint32_t hx; + + /* + * We use exp(x) = exp(x - kln2) * 2**k, carefully chosen to + * minimize |exp(kln2) - 2**k|. We also scale the exponent of + * exp_x to MAX_EXP so that the result can be multiplied by + * a tiny number without losing accuracy due to denormalization. + */ + exp_x = exp(x - kln2); + get_high_word(hx, exp_x); + *expt = (hx >> 20) - (0x3ff + 1023) + k; + set_high_word(exp_x, (hx & 0xfffff) | ((0x3ff + 1023) << 20)); + return (exp_x); +} + +__host__ __device__ inline complex ldexp_cexp(complex z, int expt) { + double x, y, exp_x, scale1, scale2; + int ex_expt, half_expt; + + x = z.real(); + y = z.imag(); + exp_x = frexp_exp(x, &ex_expt); + expt += ex_expt; + + /* + * Arrange so that scale1 * scale2 == 2**expt. We use this to + * compensate for scalbn being horrendously slow. 
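+ *
+ * The exponent expt can exceed DBL_MAX_EXP - 1 = 1023 at this point, so a
+ * single factor of 2**expt would itself overflow to Inf; two half-sized
+ * factors applied one after the other keep every intermediate finite. A
+ * portable sketch of the same splitting (hypothetical code, slower than
+ * the insert_words() construction used below):
+ *
+ *   result *= ldexp(1.0, expt / 2);         // first half of the exponent
+ *   result *= ldexp(1.0, expt - expt / 2);  // remaining half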
+ */ + half_expt = expt / 2; + insert_words(scale1, (0x3ff + half_expt) << 20, 0); + half_expt = expt - half_expt; + insert_words(scale2, (0x3ff + half_expt) << 20, 0); + + return (complex(::cos(y) * exp_x * scale1 * scale2, + ::sin(y) * exp_x * scale1 * scale2)); +} + +__host__ __device__ inline complex cexp(const complex& z) { + double x, y, exp_x; + uint32_t hx, hy, lx, ly; + + const uint32_t exp_ovfl = 0x40862e42, /* high bits of MAX_EXP * ln2 ~= 710 */ + cexp_ovfl = 0x4096b8e4; /* (MAX_EXP - MIN_DENORM_EXP) * ln2 */ + + x = z.real(); + y = z.imag(); + + extract_words(hy, ly, y); + hy &= 0x7fffffff; + + /* cexp(x + I 0) = exp(x) + I 0 */ + if ((hy | ly) == 0) return (complex(exp(x), y)); + extract_words(hx, lx, x); + /* cexp(0 + I y) = cos(y) + I sin(y) */ + if (((hx & 0x7fffffff) | lx) == 0) return (complex(cos(y), sin(y))); + + if (hy >= 0x7ff00000) { + if (lx != 0 || (hx & 0x7fffffff) != 0x7ff00000) { + /* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */ + return (complex(y - y, y - y)); + } else if (hx & 0x80000000) { + /* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */ + return (complex(0.0, 0.0)); + } else { + /* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */ + return (complex(x, y - y)); + } + } + + if (hx >= exp_ovfl && hx <= cexp_ovfl) { + /* + * x is between 709.7 and 1454.3, so we must scale to avoid + * overflow in exp(x). + */ + return (ldexp_cexp(z, 0)); + } else { + /* + * Cases covered here: + * - x < exp_ovfl and exp(x) won't overflow (common case) + * - x > cexp_ovfl, so exp(x) * s overflows for all s > 0 + * - x = +-Inf (generated by exp()) + * - x = NaN (spurious inexact exception from y) + */ + exp_x = ::exp(x); + return (complex(exp_x * cos(y), exp_x * sin(y))); + } +} + +} // namespace complex + +} // namespace detail + +template +__host__ __device__ inline complex exp(const complex& z) { + return polar(::exp(z.real()), z.imag()); +} + +template <> +__host__ __device__ inline complex exp(const complex& z) { + return detail::complex::cexp(z); +} + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/cexpf.h b/cupy/_core/include/cupy/complex/cexpf.h new file mode 100644 index 0000000..e3f63c8 --- /dev/null +++ b/cupy/_core/include/cupy/complex/cexpf.h @@ -0,0 +1,153 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2011 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* adapted from FreeBSD: + * lib/msun/src/s_cexpf.c + * lib/msun/src/k_exp.c + * + */ + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +__host__ __device__ inline float frexp_expf(float x, int* expt) { + const uint32_t k = 235; /* constant for reduction */ + const float kln2 = 162.88958740F; /* k * ln2 */ + + // should this be a double instead? + float exp_x; + uint32_t hx; + + exp_x = expf(x - kln2); + get_float_word(hx, exp_x); + *expt = (hx >> 23) - (0x7f + 127) + k; + set_float_word(exp_x, (hx & 0x7fffff) | ((0x7f + 127) << 23)); + return (exp_x); +} + +__host__ __device__ inline complex ldexp_cexpf(complex z, int expt) { + float x, y, exp_x, scale1, scale2; + int ex_expt, half_expt; + + x = z.real(); + y = z.imag(); + exp_x = frexp_expf(x, &ex_expt); + expt += ex_expt; + + half_expt = expt / 2; + set_float_word(scale1, (0x7f + half_expt) << 23); + half_expt = expt - half_expt; + set_float_word(scale2, (0x7f + half_expt) << 23); + + return (complex(cos(y) * exp_x * scale1 * scale2, + sin(y) * exp_x * scale1 * scale2)); +} + +__host__ __device__ inline complex cexpf(const complex& z) { + float x, y, exp_x; + uint32_t hx, hy; + + const uint32_t exp_ovfl = 0x42b17218, /* MAX_EXP * ln2 ~= 88.722839355 */ + cexp_ovfl = 0x43400074; /* (MAX_EXP - MIN_DENORM_EXP) * ln2 */ + + x = z.real(); + y = z.imag(); + + get_float_word(hy, y); + hy &= 0x7fffffff; + + /* cexp(x + I 0) = exp(x) + I 0 */ + if (hy == 0) return (complex(exp(x), y)); + get_float_word(hx, x); + /* cexp(0 + I y) = cos(y) + I sin(y) */ + if ((hx & 0x7fffffff) == 0) { + return (complex(cos(y), sin(y))); + } + if (hy >= 0x7f800000) { + if ((hx & 0x7fffffff) != 0x7f800000) { + /* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */ + return (complex(y - y, y - y)); + } else if (hx & 0x80000000) { + /* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */ + return (complex(0.0, 0.0)); + } else { + /* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */ + return (complex(x, y - y)); + } + } + + if (hx >= exp_ovfl && hx <= cexp_ovfl) { + /* + * x is between 88.7 and 192, so we must scale to avoid + * overflow in expf(x). 
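+ *
+ * The bounds exp_ovfl and cexp_ovfl are raw IEEE single-precision bit
+ * patterns: 0x42b17218 encodes ~88.7228f (FLT_MAX_EXP * ln 2) and
+ * 0x43400074 encodes ~192.002f, and for positive finite floats the bit
+ * patterns order the same way as the values, so the integer comparisons
+ * here stand in for float comparisons. A hedged illustration of the trick
+ * (hypothetical code, not part of this file):
+ *
+ *   float limit = 88.7228f;
+ *   uint32_t bits;
+ *   memcpy(&bits, &limit, sizeof(bits));  // same effect as
+ *                                         // get_float_word(bits, limit)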
+ */ + return (ldexp_cexpf(z, 0)); + } else { + /* + * Cases covered here: + * - x < exp_ovfl and exp(x) won't overflow (common case) + * - x > cexp_ovfl, so exp(x) * s overflows for all s > 0 + * - x = +-Inf (generated by exp()) + * - x = NaN (spurious inexact exception from y) + */ + exp_x = ::exp(x); + return (complex(exp_x * ::cos(y), exp_x * ::sin(y))); + } +} + +} // namespace complex + +} // namespace detail + +template <> +__host__ __device__ inline complex exp(const complex& z) { + return detail::complex::cexpf(z); +} + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/clog.h b/cupy/_core/include/cupy/complex/clog.h new file mode 100644 index 0000000..96444d8 --- /dev/null +++ b/cupy/_core/include/cupy/complex/clog.h @@ -0,0 +1,203 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2012 Stephen Montgomery-Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* adapted from FreeBSDs msun:*/ + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +using thrust::complex; + +/* round down to 18 = 54/3 bits */ +__host__ __device__ inline double trim(double x) { + uint32_t hi; + get_high_word(hi, x); + insert_words(x, hi & 0xfffffff8, 0); + return x; +} + +__host__ __device__ inline complex clog(const complex& z) { + // Adapted from FreeBSDs msun + double x, y; + double ax, ay; + double x0, y0, x1, y1, x2, y2, t, hm1; + double val[12]; + int i, sorted; + const double e = 2.7182818284590452354; + + x = z.real(); + y = z.imag(); + + /* Handle NaNs using the general formula to mix them right. 
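+ *
+ * Hedged aside (not from the FreeBSD sources): once either part is NaN the
+ * result is NaN + I NaN regardless, so it is harmless that norm(z) is
+ * |z|**2 rather than |z| (the missing factor of 1/2 on the log cannot be
+ * observed through a NaN). The formula is used only so that both x and y
+ * feed into each component; e.g. ::log(norm(complex<double>(nan, 1.0)))
+ * and ::atan2(1.0, nan) are both NaN.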
*/ + if (x != x || y != y) { + return (complex(::log(norm(z)), ::atan2(y, x))); + } + + ax = ::abs(x); + ay = ::abs(y); + if (ax < ay) { + t = ax; + ax = ay; + ay = t; + } + + /* + * To avoid unnecessary overflow, if x and y are very large, divide x + * and y by M_E, and then add 1 to the logarithm. This depends on + * M_E being larger than sqrt(2). + * There is a potential loss of accuracy caused by dividing by M_E, + * but this case should happen extremely rarely. + */ + // if (ay > 5e307){ + // For high values of ay -> hypotf(DBL_MAX,ay) = inf + // We expect that for values at or below ay = 5e307 this should not happen + if (ay > 5e307) { + return (complex(::log(hypot(x / e, y / e)) + 1.0, ::atan2(y, x))); + } + if (ax == 1.) { + if (ay < 1e-150) { + return (complex((ay * 0.5) * ay, ::atan2(y, x))); + } + return (complex(log1p(ay * ay) * 0.5, ::atan2(y, x))); + } + + /* + * Because atan2 and hypot conform to C99, this also covers all the + * edge cases when x or y are 0 or infinite. + */ + if (ax < 1e-50 || ay < 1e-50 || ax > 1e50 || ay > 1e50) { + return (complex(::log(hypot(x, y)), ::atan2(y, x))); + } + + /* + * From this point on, we don't need to worry about underflow or + * overflow in calculating ax*ax or ay*ay. + */ + + /* Some easy cases. */ + + if (ax >= 1.0) { + return (complex(log1p((ax - 1) * (ax + 1) + ay * ay) * 0.5, + atan2(y, x))); + } + + if (ax * ax + ay * ay <= 0.7) { + return (complex(::log(ax * ax + ay * ay) * 0.5, ::atan2(y, x))); + } + + /* + * Take extra care so that ULP of real part is small if hypot(x,y) is + * moderately close to 1. + */ + + x0 = trim(ax); + ax = ax - x0; + x1 = trim(ax); + x2 = ax - x1; + y0 = trim(ay); + ay = ay - y0; + y1 = trim(ay); + y2 = ay - y1; + + val[0] = x0 * x0; + val[1] = y0 * y0; + val[2] = 2 * x0 * x1; + val[3] = 2 * y0 * y1; + val[4] = x1 * x1; + val[5] = y1 * y1; + val[6] = 2 * x0 * x2; + val[7] = 2 * y0 * y2; + val[8] = 2 * x1 * x2; + val[9] = 2 * y1 * y2; + val[10] = x2 * x2; + val[11] = y2 * y2; + + /* Bubble sort. */ + + do { + sorted = 1; + for (i = 0; i < 11; i++) { + if (val[i] < val[i + 1]) { + sorted = 0; + t = val[i]; + val[i] = val[i + 1]; + val[i + 1] = t; + } + } + } while (!sorted); + + hm1 = -1; + for (i = 0; i < 12; i++) { + hm1 += val[i]; + } + return (complex(0.5 * log1p(hm1), atan2(y, x))); +} + +} // namespace complex + +} // namespace detail + +template +__host__ __device__ inline complex log(const complex& z) { + return complex(::log(thrust::abs(z)), thrust::arg(z)); +} + +template <> +__host__ __device__ inline complex log(const complex& z) { + return detail::complex::clog(z); +} + +template +__host__ __device__ inline complex log10(const complex& z) { + // Using the explicit literal prevents compile time warnings in + // devices that don't support doubles + return thrust::log(z) / ValueType(2.30258509299404568402); +} + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/clogf.h b/cupy/_core/include/cupy/complex/clogf.h new file mode 100644 index 0000000..4084c2b --- /dev/null +++ b/cupy/_core/include/cupy/complex/clogf.h @@ -0,0 +1,192 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2012 Stephen Montgomery-Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* adapted from FreeBSDs msun:*/ + +#pragma once + +#include +#include + +namespace thrust { +namespace detail { +namespace complex { + +using thrust::complex; + +/* round down to 8 = 24/3 bits */ +__host__ __device__ inline float trim(float x) { + uint32_t hx; + get_float_word(hx, x); + hx &= 0xffff0000; + float ret; + set_float_word(ret, hx); + return ret; +} + +__host__ __device__ inline complex clogf(const complex& z) { + // Adapted from FreeBSDs msun + float x, y; + float ax, ay; + float x0, y0, x1, y1, x2, y2, t, hm1; + float val[12]; + int i, sorted; + const float e = 2.7182818284590452354f; + + x = z.real(); + y = z.imag(); + + /* Handle NaNs using the general formula to mix them right. */ + if (x != x || y != y) { + return (complex(::log(norm(z)), ::atan2(y, x))); + } + + ax = ::abs(x); + ay = ::abs(y); + if (ax < ay) { + t = ax; + ax = ay; + ay = t; + } + + /* + * To avoid unnecessary overflow, if x and y are very large, divide x + * and y by M_E, and then add 1 to the logarithm. This depends on + * M_E being larger than sqrt(2). + * There is a potential loss of accuracy caused by dividing by M_E, + * but this case should happen extremely rarely. + */ + // For high values of ay -> hypotf(FLT_MAX,ay) = inf + // We expect that for values at or below ay = 1e34f this should not happen + if (ay > 1e34f) { + return (complex(::log(hypotf(x / e, y / e)) + 1.0f, ::atan2(y, x))); + } + if (ax == 1.f) { + if (ay < 1e-19f) { + return (complex((ay * 0.5f) * ay, ::atan2(y, x))); + } + return (complex(log1pf(ay * ay) * 0.5f, ::atan2(y, x))); + } + + /* + * Because atan2 and hypot conform to C99, this also covers all the + * edge cases when x or y are 0 or infinite. 
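+ *
+ * Concretely (hedged summary of the C99 Annex F guarantees, not from the
+ * FreeBSD sources): hypotf returns +Inf whenever either argument is
+ * infinite, even if the other is NaN, and atan2f implements the full set
+ * of signed-zero and infinity special cases, so the general formula below
+ * yields the expected branch cuts, e.g.
+ *
+ *   clogf(complex<float>(0.f, 0.f))       -> (-Inf, +0)
+ *   clogf(complex<float>(INFINITY, 1.f))  -> (+Inf, +0)
+ *   clogf(complex<float>(-1e10f, 0.f))    -> (~23.03, pi)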
+ */
+  if (ax < 1e-6f || ay < 1e-6f || ax > 1e6f || ay > 1e6f) {
+    return (complex<float>(::log(hypotf(x, y)), ::atan2(y, x)));
+  }
+
+  /*
+   * From this point on, we don't need to worry about underflow or
+   * overflow in calculating ax*ax or ay*ay.
+   */
+
+  /* Some easy cases. */
+
+  if (ax >= 1.0f) {
+    return (complex<float>(log1pf((ax - 1.f) * (ax + 1.f) + ay * ay) * 0.5f,
+                           atan2(y, x)));
+  }
+
+  if (ax * ax + ay * ay <= 0.7f) {
+    return (complex<float>(::log(ax * ax + ay * ay) * 0.5f, ::atan2(y, x)));
+  }
+
+  /*
+   * Take extra care so that ULP of real part is small if hypot(x,y) is
+   * moderately close to 1.
+   */
+
+  x0 = trim(ax);
+  ax = ax - x0;
+  x1 = trim(ax);
+  x2 = ax - x1;
+  y0 = trim(ay);
+  ay = ay - y0;
+  y1 = trim(ay);
+  y2 = ay - y1;
+
+  val[0] = x0 * x0;
+  val[1] = y0 * y0;
+  val[2] = 2 * x0 * x1;
+  val[3] = 2 * y0 * y1;
+  val[4] = x1 * x1;
+  val[5] = y1 * y1;
+  val[6] = 2 * x0 * x2;
+  val[7] = 2 * y0 * y2;
+  val[8] = 2 * x1 * x2;
+  val[9] = 2 * y1 * y2;
+  val[10] = x2 * x2;
+  val[11] = y2 * y2;
+
+  /* Bubble sort. */
+
+  do {
+    sorted = 1;
+    for (i = 0; i < 11; i++) {
+      if (val[i] < val[i + 1]) {
+        sorted = 0;
+        t = val[i];
+        val[i] = val[i + 1];
+        val[i + 1] = t;
+      }
+    }
+  } while (!sorted);
+
+  hm1 = -1;
+  for (i = 0; i < 12; i++) {
+    hm1 += val[i];
+  }
+  return (complex<float>(0.5f * log1pf(hm1), atan2(y, x)));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__ inline complex<float> log(const complex<float>& z) {
+  return detail::complex::clogf(z);
+}
+
+} // namespace thrust
diff --git a/cupy/_core/include/cupy/complex/complex.h b/cupy/_core/include/cupy/complex/complex.h
new file mode 100644
index 0000000..dc5f5f1
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/complex.h
@@ -0,0 +1,674 @@
+/* Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*! \file complex.h
+ * \brief Complex numbers
+ */
+
+#pragma once
+
+namespace thrust {
+
+template <typename T, typename U, bool x>
+struct _select_greater_type_impl {
+  typedef T type;
+};
+
+template <typename T, typename U>
+struct _select_greater_type_impl<T, U, false> {
+  typedef U type;
+};
+
+template <typename T, typename U>
+struct _select_greater_type
+    : _select_greater_type_impl<T, U, (sizeof(T) > sizeof(U))> {};
+
+/*
+ * Calls to the standard math library from inside the thrust namespace
+ * with real arguments require explicit scope, otherwise they will fail
+ * to resolve: the compiler finds the equivalent complex function, then
+ * fails to match the template and gives up looking in other scopes.
+ */
+
+/*! \addtogroup numerics
+ * \{
+ */
+
+/*! \addtogroup complex_numbers Complex Numbers
+ * \{
+ */
+
+/*! \p complex is the Thrust equivalent to std::complex. It is functionally
+ * equivalent to it, but can also be used in device code, which std::complex
+ * currently cannot.
+ *
+ * \tparam T The type used to hold the real and imaginary parts. Should be
+ * float or double. Other types are not supported.
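+ *
+ * A brief usage sketch (hypothetical host-side code, assuming this header
+ * is on the include path):
+ *
+ *   thrust::complex<float> a(1.0f, 2.0f), b(3.0f, -1.0f);
+ *   thrust::complex<float> c = a * b + thrust::conj(a);
+ *   float magnitude = thrust::abs(c);  // |c|
+ *   float phase = thrust::arg(c);      // angle of c in radians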
+ * + */ +template +#if defined(__CUDACC__) +struct __align__(sizeof(T)*2) complex { +#else +// ROCm (hipcc) does not support `__align__` +struct complex { +#endif + public: + /*! \p value_type is the type of \p complex's real and imaginary parts. + */ + typedef T value_type; + + /* --- Constructors --- */ + + /*! Construct a complex number with an imaginary part of 0. + * + * \param re The real part of the number. + */ + inline __host__ __device__ complex(const T& re); + + /*! Construct a complex number from its real and imaginary parts. + * + * \param re The real part of the number. + * \param im The imaginary part of the number. + */ + inline __host__ __device__ complex(const T& re, const T& im); + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) + /*! Default construct a complex number. + */ + inline complex() = default; + + /*! This copy constructor copies from a \p complex with a type that is + * convertible to this \p complex's \c value_type. + * + * \param z The \p complex to copy from. + */ + inline complex(const complex& z) = default; +#else + /*! Default construct a complex number. + */ + inline __host__ __device__ complex(); + + /*! This copy constructor copies from a \p complex with a type that is + * convertible to this \p complex's \c value_type. + * + * \param z The \p complex to copy from. + */ + inline __host__ __device__ complex(const complex& z); +#endif // c++11 + + /*! This copy constructor copies from a \p complex with a type that + * is convertible to this \p complex \c value_type. + * + * \param z The \p complex to copy from. + * + * \tparam X is convertible to \c value_type. + */ + template + inline __host__ __device__ complex(const complex& z); + + /* --- Assignment Operators --- */ + + /*! Assign `re` to the real part of this \p complex and set the imaginary part + * to 0. + * + * \param re The real part of the number. + */ + inline __host__ __device__ complex& operator=(const T& re); + + /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this + * \p complex respectively. + * + * \param z The \p complex to copy from. + */ + inline __host__ __device__ complex& operator=(const complex& z); + + /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this + * \p complex respectively. + * + * \param z The \p complex to copy from. + * + * \tparam U is convertible to \c value_type. + */ + template + inline __host__ __device__ complex& operator=(const complex& z); + + /* --- Compound Assignment Operators --- */ + + /*! Adds a \p complex to this \p complex and + * assigns the result to this \p complex. + * + * \param z The \p complex to be Added. + */ + __host__ __device__ inline complex& operator+=(const complex z); + + /*! Subtracts a \p complex from this \p complex and + * assigns the result to this \p complex. + * + * \param z The \p complex to be subtracted. + */ + __host__ __device__ inline complex& operator-=(const complex z); + + /*! Multiplies this \p complex by another \p complex and + * assigns the result to this \p complex. + * + * \param z The \p complex to be multiplied. + */ + __host__ __device__ inline complex& operator*=(const complex z); + + /*! Divides this \p complex by another \p complex and + * assigns the result to this \p complex. + * + * \param z The \p complex to be divided. + */ + __host__ __device__ inline complex& operator/=(const complex z); + + /* --- Getter functions --- + * The volatile ones are there to help for example + * with certain reductions optimizations + */ + + /*! 
Returns the real part of this \p complex.
+   */
+  __host__ __device__ inline T real() const volatile { return m_data[0]; }
+
+  /*! Returns the imaginary part of this \p complex.
+   */
+  __host__ __device__ inline T imag() const volatile { return m_data[1]; }
+
+  /*! Returns the real part of this \p complex.
+   */
+  __host__ __device__ inline T real() const { return m_data[0]; }
+
+  /*! Returns the imaginary part of this \p complex.
+   */
+  __host__ __device__ inline T imag() const { return m_data[1]; }
+
+  /* --- Setter functions ---
+   * The volatile ones are there to help for example
+   * with certain reductions optimizations
+   */
+
+  /*! Sets the real part of this \p complex.
+   *
+   * \param re The new real part of this \p complex.
+   */
+  __host__ __device__ inline void real(T re) volatile { m_data[0] = re; }
+
+  /*! Sets the imaginary part of this \p complex.
+   *
+   * \param im The new imaginary part of this \p complex.
+   */
+  __host__ __device__ inline void imag(T im) volatile { m_data[1] = im; }
+
+  /*! Sets the real part of this \p complex.
+   *
+   * \param re The new real part of this \p complex.
+   */
+  __host__ __device__ inline void real(T re) { m_data[0] = re; }
+
+  /*! Sets the imaginary part of this \p complex.
+   *
+   * \param im The new imaginary part of this \p complex.
+   */
+  __host__ __device__ inline void imag(T im) { m_data[1] = im; }
+
+ private:
+  T m_data[2];
+};
+
+/* --- General Functions --- */
+
+/*! Returns the magnitude (also known as absolute value) of a \p complex.
+ *
+ * \param z The \p complex from which to calculate the absolute value.
+ */
+template <typename T>
+__host__ __device__ inline T abs(const complex<T>& z);
+
+/*! Returns the phase angle (also known as argument) in radians of a \p complex.
+ *
+ * \param z The \p complex from which to calculate the phase angle.
+ */
+template <typename T>
+__host__ __device__ inline T arg(const complex<T>& z);
+
+/*! Returns the square of the magnitude of a \p complex.
+ *
+ * \param z The \p complex from which to calculate the norm.
+ */
+template <typename T>
+__host__ __device__ inline T norm(const complex<T>& z);
+
+/*! Returns the complex conjugate of a \p complex.
+ *
+ * \param z The \p complex from which to calculate the complex conjugate.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> conj(const complex<T>& z);
+
+/*! Returns the real part of a \p complex.
+ *
+ * \param z The \p complex from which to return the real part
+ */
+template <typename T>
+__host__ __device__ inline T real(const complex<T>& z);
+
+/*! Returns the imaginary part of a \p complex.
+ *
+ * \param z The \p complex from which to return the imaginary part
+ */
+template <typename T>
+__host__ __device__ inline T imag(const complex<T>& z);
+
+/*! Returns a \p complex with the specified magnitude and phase.
+ *
+ * \param m The magnitude of the returned \p complex.
+ * \param theta The phase of the returned \p complex in radians.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> polar(const T& m, const T& theta = 0);
+
+/*! Returns the projection of a \p complex on the Riemann sphere.
+ * For every finite \p complex it returns its argument. For a \p complex
+ * with a non-finite part it returns (INFINITY, +/-0), where the sign of
+ * the zero matches the sign of the imaginary part of the argument.
+ *
+ * \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> proj(const T& z);
+
+/* --- Binary Arithmetic operators --- */
+
+/*! Multiplies two \p complex numbers.
+ *
+ * \param lhs The first \p complex.
+ * \param rhs The second \p complex.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator*(const complex<T>& lhs,
+                                                const complex<T>& rhs);
+
+/*! Multiplies a \p complex number by a scalar.
+ *
+ * \param lhs The \p complex.
+ * \param rhs The scalar.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator*(const complex<T>& lhs, const T& rhs);
+
+/*! Multiplies a scalar by a \p complex number.
+ *
+ * \param lhs The scalar.
+ * \param rhs The \p complex.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator*(const T& lhs, const complex<T>& rhs);
+
+/*! Divides two \p complex numbers.
+ *
+ * \param lhs The numerator (dividend).
+ * \param rhs The denominator (divisor).
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator/(const complex<T>& lhs,
+                                                const complex<T>& rhs);
+
+/*! Divides a \p complex number by a scalar.
+ *
+ * \param lhs The complex numerator (dividend).
+ * \param rhs The scalar denominator (divisor).
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator/(const complex<T>& lhs, const T& rhs);
+
+/*! Divides a scalar by a \p complex number.
+ *
+ * \param lhs The scalar numerator (dividend).
+ * \param rhs The complex denominator (divisor).
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator/(const T& lhs, const complex<T>& rhs);
+
+/*! Adds two \p complex numbers.
+ *
+ * \param lhs The first \p complex.
+ * \param rhs The second \p complex.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator+(const complex<T>& lhs,
+                                                const complex<T>& rhs);
+
+/*! Adds a scalar to a \p complex number.
+ *
+ * \param lhs The \p complex.
+ * \param rhs The scalar.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator+(const complex<T>& lhs, const T& rhs);
+
+/*! Adds a \p complex number to a scalar.
+ *
+ * \param lhs The scalar.
+ * \param rhs The \p complex.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator+(const T& lhs, const complex<T>& rhs);
+
+/*! Subtracts two \p complex numbers.
+ *
+ * \param lhs The first \p complex (minuend).
+ * \param rhs The second \p complex (subtrahend).
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator-(const complex<T>& lhs,
+                                                const complex<T>& rhs);
+
+/*! Subtracts a scalar from a \p complex number.
+ *
+ * \param lhs The \p complex (minuend).
+ * \param rhs The scalar (subtrahend).
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator-(const complex<T>& lhs, const T& rhs);
+
+/*! Subtracts a \p complex number from a scalar.
+ *
+ * \param lhs The scalar (minuend).
+ * \param rhs The \p complex (subtrahend).
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator-(const T& lhs, const complex<T>& rhs);
+
+/* --- Unary Arithmetic operators --- */
+
+/*! Unary plus, returns its \p complex argument.
+ *
+ * \param rhs The \p complex argument.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator+(const complex<T>& rhs);
+
+/*! Unary minus, returns the additive inverse (negation) of its \p complex
+ * argument.
+ *
+ * \param rhs The \p complex argument.
+ */
+template <typename T>
+__host__ __device__ inline complex<T> operator-(const complex<T>& rhs);
+
+/* --- Exponential Functions --- */
+
+/*! Returns the complex exponential of a \p complex number.
+ *
+ * \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__ complex<T> exp(const complex<T>& z);
+
+/*! Returns the complex natural logarithm of a \p complex number.
+ *
+ * \param z The \p complex argument.
+ */
+template <typename T>
+__host__ __device__ complex<T> log(const complex<T>& z);
+
+/*! Returns the complex base 10 logarithm of a \p complex number.
+ *
+ * \param z The \p complex argument.
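+ *
+ * Implemented in clog.h as thrust::log(z) divided by 2.30258509299404568402
+ * (i.e. ln 10), via the identity log10(w) = log(w) / log(10) on the
+ * principal branch; for example log10(complex<double>(100.0, 0.0))
+ * evaluates to (2, 0).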
+ */ +template +__host__ __device__ inline complex log10(const complex& z); + +/* --- Power Functions --- */ + +/*! Returns a \p complex number raised to another. + * + * \param x The base. + * \param y The exponent. + */ +template +__host__ __device__ complex pow(const complex& x, const complex& y); + +/*! Returns a \p complex number raised to a scalar. + * + * \param x The \p complex base. + * \param y The scalar exponent. + */ +template +__host__ __device__ complex pow(const complex& x, const T& y); + +/*! Returns a scalar raised to a \p complex number. + * + * \param x The scalar base. + * \param y The \p complex exponent. + */ +template +__host__ __device__ complex pow(const T& x, const complex& y); + +/*! Returns a \p complex number raised to another. The types of the two \p + * complex should be compatible + * and the type of the returned \p complex is the promoted type of the two + * arguments. + * + * \param x The base. + * \param y The exponent. + */ +template +__host__ __device__ complex::type> pow( + const complex& x, const complex& y); + +/*! Returns a \p complex number raised to a scalar. The type of the \p complex + * should be compatible with the scalar + * and the type of the returned \p complex is the promoted type of the two + * arguments. + * + * \param x The base. + * \param y The exponent. + */ +template +__host__ __device__ complex::type> pow( + const complex& x, const U& y); + +/*! Returns a scalar raised to a \p complex number. The type of the \p complex + * should be compatible with the scalar + * and the type of the returned \p complex is the promoted type of the two + * arguments. + * + * \param x The base. + * \param y The exponent. + */ +template +__host__ __device__ complex::type> pow( + const T& x, const complex& y); + +/*! Returns the complex square root of a \p complex number. + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex sqrt(const complex& z); + +/* --- Trigonometric Functions --- */ + +/*! Returns the complex cosine of a \p complex number. + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex cos(const complex& z); + +/*! Returns the complex sine of a \p complex number. + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex sin(const complex& z); + +/*! Returns the complex tangent of a \p complex number. + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex tan(const complex& z); + +/* --- Hyperbolic Functions --- */ + +/*! Returns the complex hyperbolic cosine of a \p complex number. + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex cosh(const complex& z); + +/*! Returns the complex hyperbolic sine of a \p complex number. + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex sinh(const complex& z); + +/*! Returns the complex hyperbolic tangent of a \p complex number. + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex tanh(const complex& z); + +/* --- Inverse Trigonometric Functions --- */ + +/*! Returns the complex arc cosine of a \p complex number. + * + * The range of the real part of the result is [0, Pi] and + * the range of the imaginary part is [-inf, +inf] + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex acos(const complex& z); + +/*! Returns the complex arc sine of a \p complex number. 
+ * + * The range of the real part of the result is [-Pi/2, Pi/2] and + * the range of the imaginary part is [-inf, +inf] + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex asin(const complex& z); + +/*! Returns the complex arc tangent of a \p complex number. + * + * The range of the real part of the result is [-Pi/2, Pi/2] and + * the range of the imaginary part is [-inf, +inf] + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex atan(const complex& z); + +/* --- Inverse Hyperbolic Functions --- */ + +/*! Returns the complex inverse hyperbolic cosine of a \p complex number. + * + * The range of the real part of the result is [0, +inf] and + * the range of the imaginary part is [-Pi, Pi] + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex acosh(const complex& z); + +/*! Returns the complex inverse hyperbolic sine of a \p complex number. + * + * The range of the real part of the result is [-inf, +inf] and + * the range of the imaginary part is [-Pi/2, Pi/2] + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex asinh(const complex& z); + +/*! Returns the complex inverse hyperbolic tangent of a \p complex number. + * + * The range of the real part of the result is [-inf, +inf] and + * the range of the imaginary part is [-Pi/2, Pi/2] + * + * \param z The \p complex argument. + */ +template +__host__ __device__ complex atanh(const complex& z); + +/* --- Equality Operators --- */ + +/*! Returns true if two \p complex numbers are equal and false otherwise. + * + * \param lhs The first \p complex. + * \param rhs The second \p complex. + */ +template +__host__ __device__ inline bool operator==(const complex& lhs, const complex& rhs); + +/*! Returns true if the imaginary part of the \p complex number is zero and the + * real part is equal to the scalar. Returns false otherwise. + * + * \param lhs The scalar. + * \param rhs The \p complex. + */ +template +__host__ __device__ inline bool operator==(const T& lhs, const complex& rhs); + +/*! Returns true if the imaginary part of the \p complex number is zero and the + * real part is equal to the scalar. Returns false otherwise. + * + * \param lhs The \p complex. + * \param rhs The scalar. + */ +template +__host__ __device__ inline bool operator==(const complex& lhs, const T& rhs); + +/*! Returns true if two \p complex numbers are different and false otherwise. + * + * \param lhs The first \p complex. + * \param rhs The second \p complex. + */ +template +__host__ __device__ inline bool operator!=(const complex& lhs, const complex& rhs); + +/*! Returns true if the imaginary part of the \p complex number is not zero or + * the real part is different from the scalar. Returns false otherwise. + * + * \param lhs The scalar. + * \param rhs The \p complex. + */ +template +__host__ __device__ inline bool operator!=(const T& lhs, const complex& rhs); + +/*! Returns true if the imaginary part of the \p complex number is not zero or + * the real part is different from the scalar. Returns false otherwise. + * + * \param lhs The \p complex. + * \param rhs The scalar. 
+ */
+template <typename T>
+__host__ __device__ inline bool operator!=(const complex<T>& lhs, const T& rhs);
+
+} // end namespace thrust
+
+#include <cupy/complex/complex_inl.h>
diff --git a/cupy/_core/include/cupy/complex/complex_inl.h b/cupy/_core/include/cupy/complex/complex_inl.h
new file mode 100644
index 0000000..e38cfbd
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/complex_inl.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cupy/complex/complex.h>
+
+namespace thrust {
+
+/* --- Constructors --- */
+template <typename T>
+inline __host__ __device__ complex<T>::complex(const T& re) {
+  real(re);
+  imag(T());
+}
+
+template <typename T>
+inline __host__ __device__ complex<T>::complex(const T& re, const T& im) {
+  real(re);
+  imag(im);
+}
+
+#if ((!defined(_MSC_VER) && __cplusplus < 201103L) || \
+     (defined(_MSC_VER) && _MSC_VER < 1900))
+template <typename T>
+inline __host__ __device__ complex<T>::complex() {
+  real(T());
+  imag(T());
+}
+
+template <typename T>
+inline __host__ __device__ complex<T>::complex(const complex<T>& z) {
+  real(z.real());
+  imag(z.imag());
+}
+#endif
+
+template <typename T>
+template <typename X>
+inline __host__ __device__ complex<T>::complex(const complex<X>& z) {
+  // The explicit T() is there to prevent Visual Studio from complaining
+  // about potential loss of precision
+  real(T(z.real()));
+  imag(T(z.imag()));
+}
+
+/* --- Assignment Operators --- */
+
+template <typename T>
+inline __host__ __device__ complex<T>& complex<T>::operator=(const T& re) {
+  real(re);
+  imag(T());
+  return *this;
+}
+
+template <typename T>
+inline __host__ __device__ complex<T>& complex<T>::operator=(const complex<T>& z) {
+  real(z.real());
+  imag(z.imag());
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+inline __host__ __device__ complex<T>& complex<T>::operator=(const complex<U>& z) {
+  real(T(z.real()));
+  imag(T(z.imag()));
+  return *this;
+}
+
+/* --- Compound Assignment Operators --- */
+// TODO(leofang): support operators with argument of type T, see upstream
+
+template <typename T>
+__host__ __device__ inline complex<T>& complex<T>::operator+=(const complex<T> z) {
+  *this = *this + z;
+  return *this;
+}
+
+template <typename T>
+__host__ __device__ inline complex<T>& complex<T>::operator-=(const complex<T> z) {
+  *this = *this - z;
+  return *this;
+}
+
+template <typename T>
+__host__ __device__ inline complex<T>& complex<T>::operator*=(const complex<T> z) {
+  *this = *this * z;
+  return *this;
+}
+
+template <typename T>
+__host__ __device__ inline complex<T>& complex<T>::operator/=(const complex<T> z) {
+  *this = *this / z;
+  return *this;
+}
+
+/* --- Equality Operators --- */
+
+template <typename T>
+__host__ __device__ inline bool operator==(const complex<T>& lhs,
+                                           const complex<T>& rhs) {
+  return lhs.real() == rhs.real() && lhs.imag() == rhs.imag();
+}
+
+template <typename T>
+__host__ __device__ inline bool operator==(const T& lhs, const complex<T>& rhs) {
+  return lhs == rhs.real() && rhs.imag() == 0;
+}
+
+template <typename T>
+__host__ __device__ inline bool operator==(const complex<T>& lhs, const T& rhs) {
+  return lhs.real() == rhs && lhs.imag() == 0;
+}
+
+template <typename T>
+__host__ __device__ inline bool operator!=(const complex<T>& lhs,
+                                           const complex<T>& rhs)
{
+  return !(lhs == rhs);
+}
+
+template <typename T>
+__host__ __device__ inline bool operator!=(const T& lhs, const complex<T>& rhs) {
+  return !(lhs == rhs);
+}
+
+template <typename T>
+__host__ __device__ inline bool operator!=(const complex<T>& lhs, const T& rhs) {
+  return !(lhs == rhs);
+}
+}
+
+#include <cupy/complex/arithmetic.h>
+#include <cupy/complex/cproj.h>
+#include <cupy/complex/cexp.h>
+#include <cupy/complex/cexpf.h>
+#include <cupy/complex/clog.h>
+#include <cupy/complex/clogf.h>
+#include <cupy/complex/cpow.h>
+#include <cupy/complex/csqrt.h>
+#include <cupy/complex/csqrtf.h>
+#include <cupy/complex/csinh.h>
+#include <cupy/complex/csinhf.h>
+#include <cupy/complex/ccosh.h>
+#include <cupy/complex/ccoshf.h>
+#include <cupy/complex/ctanh.h>
+#include <cupy/complex/ctanhf.h>
+#include <cupy/complex/catrig.h>
+#include <cupy/complex/catrigf.h>
diff --git a/cupy/_core/include/cupy/complex/cpow.h b/cupy/_core/include/cupy/complex/cpow.h
new file mode 100644
index 0000000..28c19c8
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/cpow.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <cupy/complex/complex.h>
+
+namespace thrust {
+
+template <typename T>
+__host__ __device__ inline complex<T> pow(const complex<T>& z,
+                                          const complex<T>& exponent) {
+  return exp(log(complex<T>(z)) * complex<T>(exponent));
+}
+
+template <typename T>
+__host__ __device__ inline complex<T> pow(const complex<T>& z, const T& exponent) {
+  return exp(log(complex<T>(z)) * T(exponent));
+}
+
+template <typename T>
+__host__ __device__ inline complex<T> pow(const T& x, const complex<T>& exponent) {
+  return exp(log(T(x)) * complex<T>(exponent));
+}
+
+template <typename T, typename U>
+__host__ __device__ inline complex<typename _select_greater_type<T, U>::type> pow(
+    const complex<T>& z, const complex<U>& exponent) {
+  typedef typename _select_greater_type<T, U>::type PromotedType;
+  return pow(complex<PromotedType>(z), complex<PromotedType>(exponent));
+}
+
+template <typename T, typename U>
+__host__ __device__ inline complex<typename _select_greater_type<T, U>::type> pow(
+    const complex<T>& z, const U& exponent) {
+  typedef typename _select_greater_type<T, U>::type PromotedType;
+  return pow(complex<PromotedType>(z), PromotedType(exponent));
+}
+
+template <typename T, typename U>
+__host__ __device__ inline complex<typename _select_greater_type<T, U>::type> pow(
+    const T& x, const complex<U>& exponent) {
+  typedef typename _select_greater_type<T, U>::type PromotedType;
+  return pow(PromotedType(x), complex<PromotedType>(exponent));
+}
+
+}
diff --git a/cupy/_core/include/cupy/complex/cproj.h b/cupy/_core/include/cupy/complex/cproj.h
new file mode 100644
index 0000000..f5e2d33
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/cproj.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace thrust {
+namespace detail {
+namespace complex {
+
+using thrust::complex;
+
+__host__ __device__ inline complex<float> cprojf(const complex<float>& z) {
+  if (!isinf(z.real()) && !isinf(z.imag())) {
+    return z;
+  } else {
+    // std::numeric_limits<T>::infinity() doesn't run on the GPU
+    return complex<float>(infinity<float>(), copysignf(0.0, z.imag()));
+  }
+}
+
+__host__ __device__ inline complex<double> cproj(const complex<double>& z) {
+  if (!isinf(z.real()) && !isinf(z.imag())) {
+    return z;
+  } else {
+    // numeric_limits<T>::infinity() doesn't run on the GPU
+    return complex<double>(infinity<double>(), copysign(0.0, z.imag()));
+  }
+}
+}
+}
+
+template <typename T>
+__host__ __device__ inline thrust::complex<T> proj(const thrust::complex<T>& z) {
+  return detail::complex::cproj(z);
+}
+
+template <>
+__host__ __device__ inline thrust::complex<double> proj(
+    const thrust::complex<double>& z) {
+  return detail::complex::cproj(z);
+}
+
+template <>
+__host__ __device__ inline thrust::complex<float> proj(const thrust::complex<float>& z) {
+  return detail::complex::cprojf(z);
+}
+}
diff --git a/cupy/_core/include/cupy/complex/csinh.h b/cupy/_core/include/cupy/complex/csinh.h
new file mode 100644
index 0000000..766004d
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/csinh.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/cupy/_core/include/cupy/complex/csinh.h b/cupy/_core/include/cupy/complex/csinh.h
new file mode 100644
index 0000000..766004d
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/csinh.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *   lib/msun/src/s_csinh.c
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace thrust {
+namespace detail {
+namespace complex {
+
+using thrust::complex;
+
+__host__ __device__ inline complex<double> csinh(const complex<double>& z) {
+  double x, y, h;
+  uint32_t hx, hy, ix, iy, lx, ly;
+  const double huge = 8.98846567431157953864652595395e+307;  // 0x1p1023;
+
+  x = z.real();
+  y = z.imag();
+
+  extract_words(hx, lx, x);
+  extract_words(hy, ly, y);
+
+  ix = 0x7fffffff & hx;
+  iy = 0x7fffffff & hy;
+
+  /* Handle the nearly-non-exceptional cases where x and y are finite. */
+  if (ix < 0x7ff00000 && iy < 0x7ff00000) {
+    if ((iy | ly) == 0) return (complex<double>(sinh(x), y));
+    if (ix < 0x40360000) /* small x: normal case */
+      return (complex<double>(sinh(x) * cos(y), cosh(x) * sin(y)));
+
+    /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+    if (ix < 0x40862e42) {
+      /* x < 710: exp(|x|) won't overflow */
+      h = exp(fabs(x)) * 0.5;
+      return (complex<double>(copysign(h, x) * cos(y), h * sin(y)));
+    } else if (ix < 0x4096bbaa) {
+      /* x < 1455: scale to avoid overflow */
+      complex<double> z_ = ldexp_cexp(complex<double>(fabs(x), y), -1);
+      return (complex<double>(z_.real() * copysign(1.0, x), z_.imag()));
+    } else {
+      /* x >= 1455: the result always overflows */
+      h = huge * x;
+      return (complex<double>(h * cos(y), h * h * sin(y)));
+    }
+  }
+
+  /*
+   * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+   * The sign of 0 in the result is unspecified.  Choice = normally
+   * the same as dNaN.  Raise the invalid floating-point exception.
+   *
+   * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+   * The sign of 0 in the result is unspecified.  Choice = normally
+   * the same as d(NaN).
+   */
+  if ((ix | lx) == 0 && iy >= 0x7ff00000)
+    return (complex<double>(copysign(0.0, x * (y - y)), y - y));
+
+  /*
+   * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+   *
+   * sinh(NaN +- I 0) = d(NaN) + I +-0.
+   */
+  if ((iy | ly) == 0 && ix >= 0x7ff00000) {
+    if (((hx & 0xfffff) | lx) == 0) return (complex<double>(x, y));
+    return (complex<double>(x, copysign(0.0, y)));
+  }
+
+  /*
+   * sinh(x +- I Inf) = dNaN + I dNaN.
+   * Raise the invalid floating-point exception for finite nonzero x.
+   *
+   * sinh(x + I NaN) = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception for finite
+   * nonzero x.  Choice = don't raise (except for signaling NaNs).
+   */
+  if (ix < 0x7ff00000 && iy >= 0x7ff00000)
+    return (complex<double>(y - y, x * (y - y)));
+
+  /*
+   * sinh(+-Inf + I NaN) = +-Inf + I d(NaN).
+   * The sign of Inf in the result is unspecified.  Choice = normally
+   * the same as d(NaN).
+   *
+   * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+   * The sign of Inf in the result is unspecified.  Choice = always +.
+   * Raise the invalid floating-point exception.
+   *
+   * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y)
+   */
+  if (ix >= 0x7ff00000 && ((hx & 0xfffff) | lx) == 0) {
+    if (iy >= 0x7ff00000) return (complex<double>(x * x, x * (y - y)));
+    return (complex<double>(x * cos(y), infinity<double>() * sin(y)));
+  }
+
+  /*
+   * sinh(NaN + I NaN) = d(NaN) + I d(NaN).
+   *
+   * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception.
+   * Choice = raise.
+   *
+   * sinh(NaN + I y) = d(NaN) + I d(NaN).
+   * Optionally raises the invalid floating-point exception for finite
+   * nonzero y.  Choice = don't raise (except for signaling NaNs).
+   */
+  return (complex<double>((x * x) * (y - y), (x + x) * (y - y)));
+}
+
+__host__ __device__ inline complex<double> csin(complex<double> z) {
+  /* csin(z) = -I * csinh(I * z) */
+  z = csinh(complex<double>(-z.imag(), z.real()));
+  return (complex<double>(z.imag(), -z.real()));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> sin(const complex<ValueType>& z) {
+  const ValueType re = z.real();
+  const ValueType im = z.imag();
+  return complex<ValueType>(::sin(re) * ::cosh(im), ::cos(re) * ::sinh(im));
+}
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> sinh(const complex<ValueType>& z) {
+  const ValueType re = z.real();
+  const ValueType im = z.imag();
+  return complex<ValueType>(::sinh(re) * ::cos(im), ::cosh(re) * ::sin(im));
+}
+
+template <>
+__host__ __device__ inline complex<double> sin(const complex<double>& z) {
+  return detail::complex::csin(z);
+}
+
+template <>
+__host__ __device__ inline complex<double> sinh(const complex<double>& z) {
+  return detail::complex::csinh(z);
+}
+
+} // namespace thrust
diff --git a/cupy/_core/include/cupy/complex/csinhf.h b/cupy/_core/include/cupy/complex/csinhf.h
new file mode 100644
index 0000000..cb12236
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/csinhf.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2005 Bruce D. Evans and Steven G. Kargl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* adapted from FreeBSD:
+ *   lib/msun/src/s_csinhf.c
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace thrust {
+namespace detail {
+namespace complex {
+
+using thrust::complex;
+
+__host__ __device__ inline complex<float> csinhf(const complex<float>& z) {
+  float x, y, h;
+  uint32_t hx, hy, ix, iy;
+
+  const float huge = 1.70141183460469231731687303716e+38;  // 0x1p127;
+
+  x = z.real();
+  y = z.imag();
+
+  get_float_word(hx, x);
+  get_float_word(hy, y);
+
+  ix = 0x7fffffff & hx;
+  iy = 0x7fffffff & hy;
+
+  if (ix < 0x7f800000 && iy < 0x7f800000) {
+    if (iy == 0) return (complex<float>(sinhf(x), y));
+    if (ix < 0x41100000) /* small x: normal case */
+      return (complex<float>(sinhf(x) * cosf(y), coshf(x) * sinf(y)));
+
+    /* |x| >= 9, so cosh(x) ~= exp(|x|) */
+    if (ix < 0x42b17218) {
+      /* x < 88.7: expf(|x|) won't overflow */
+      h = expf(fabsf(x)) * 0.5f;
+      return (complex<float>(copysignf(h, x) * cosf(y), h * sinf(y)));
+    } else if (ix < 0x4340b1e7) {
+      /* x < 192.7: scale to avoid overflow */
+      complex<float> z_ = ldexp_cexpf(complex<float>(fabsf(x), y), -1);
+      return (complex<float>(z_.real() * copysignf(1.0f, x), z_.imag()));
+    } else {
+      /* x >= 192.7: the result always overflows */
+      h = huge * x;
+      return (complex<float>(h * cosf(y), h * h * sinf(y)));
+    }
+  }
+
+  if (ix == 0 && iy >= 0x7f800000)
+    return (complex<float>(copysignf(0, x * (y - y)), y - y));
+
+  if (iy == 0 && ix >= 0x7f800000) {
+    if ((hx & 0x7fffff) == 0) return (complex<float>(x, y));
+    return (complex<float>(x, copysignf(0.0f, y)));
+  }
+
+  if (ix < 0x7f800000 && iy >= 0x7f800000)
+    return (complex<float>(y - y, x * (y - y)));
+
+  if (ix >= 0x7f800000 && (hx & 0x7fffff) == 0) {
+    if (iy >= 0x7f800000) return (complex<float>(x * x, x * (y - y)));
+    return (complex<float>(x * cosf(y), infinity<float>() * sinf(y)));
+  }
+
+  return (complex<float>((x * x) * (y - y), (x + x) * (y - y)));
+}
+
+__host__ __device__ inline complex<float> csinf(complex<float> z) {
+  z = csinhf(complex<float>(-z.imag(), z.real()));
+  return (complex<float>(z.imag(), -z.real()));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__ inline complex<float> sin(const complex<float>& z) {
+  return detail::complex::csinf(z);
+}
+
+template <>
+__host__ __device__ inline complex<float> sinh(const complex<float>& z) {
+  return detail::complex::csinhf(z);
+}
+
+} // namespace thrust
diff --git a/cupy/_core/include/cupy/complex/csqrt.h b/cupy/_core/include/cupy/complex/csqrt.h
new file mode 100644
index 0000000..68d8a13
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/csqrt.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2007 David Schultz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *   freebsd/lib/msun/src/s_csqrt.c
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace thrust {
+namespace detail {
+namespace complex {
+
+using thrust::complex;
+
+__host__ __device__ inline complex<double> csqrt(const complex<double>& z) {
+  complex<double> result;
+  double a, b;
+  double t;
+  int scale;
+
+  /* We risk spurious overflow for components >= DBL_MAX / (1 + sqrt(2)). */
+  const double THRESH = 7.446288774449766337959726e+307;
+
+  a = z.real();
+  b = z.imag();
+
+  /* Handle special cases. */
+  if (z == 0.0) return (complex<double>(0.0, b));
+  if (isinf(b)) return (complex<double>(infinity<double>(), b));
+  if (isnan(a)) {
+    t = (b - b) / (b - b);          /* raise invalid if b is not a NaN */
+    return (complex<double>(a, t)); /* return NaN + NaN i */
+  }
+  if (isinf(a)) {
+    /*
+     * csqrt(inf + NaN i)  = inf + NaN i
+     * csqrt(inf + y i)    = inf + 0 i
+     * csqrt(-inf + NaN i) = NaN +- inf i
+     * csqrt(-inf + y i)   = 0 + inf i
+     */
+    if (signbit(a))
+      return (complex<double>(fabs(b - b), copysign(a, b)));
+    else
+      return (complex<double>(a, copysign(b - b, b)));
+  }
+  /*
+   * The remaining special case (b is NaN) is handled just fine by
+   * the normal code path below.
+   */
+
+  // DBL_MIN*2
+  const double low_thresh = 4.450147717014402766180465e-308;
+  scale = 0;
+
+  if (fabs(a) >= THRESH || fabs(b) >= THRESH) {
+    /* Scale to avoid overflow. */
+    a *= 0.25;
+    b *= 0.25;
+    scale = 1;
+  } else if (fabs(a) <= low_thresh && fabs(b) <= low_thresh) {
+    /* Scale to avoid underflow. */
+    a *= 4.0;
+    b *= 4.0;
+    scale = 2;
+  }
+
+  /* Algorithm 312, CACM vol 10, Oct 1967. */
+  if (a >= 0.0) {
+    t = sqrt((a + hypot(a, b)) * 0.5);
+    result = complex<double>(t, b / (2 * t));
+  } else {
+    t = sqrt((-a + hypot(a, b)) * 0.5);
+    result = complex<double>(fabs(b) / (2 * t), copysign(t, b));
+  }
+
+  /* Rescale. */
+  if (scale == 1)
+    return (result * 2.0);
+  else if (scale == 2)
+    return (result * 0.5);
+  else
+    return (result);
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> sqrt(const complex<ValueType>& z) {
+  return thrust::polar(::sqrt(thrust::abs(z)), thrust::arg(z) / ValueType(2));
+}
+
+template <>
+__host__ __device__ inline complex<double> sqrt(const complex<double>& z) {
+  return detail::complex::csqrt(z);
+}
+
+} // namespace thrust
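Worked example of the core step above: Algorithm 312 takes t = sqrt((|a| + hypot(a, b)) / 2) and derives the other component from b / (2t), with the prior scaling only protecting hypot() from overflow/underflow near DBL_MAX and DBL_MIN. A host-side check on sqrt(-3 + 4i) = 1 + 2i (illustrative only, not part of the CuPy sources):

#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  const double a = -3.0, b = 4.0;
  // Negative-real branch of Algorithm 312, mirroring csqrt() above:
  // hypot = 5, t = sqrt((3 + 5) / 2) = 2, re = |4| / (2 * 2) = 1, im = 2.
  const double t = std::sqrt((-a + std::hypot(a, b)) * 0.5);
  const double re = std::fabs(b) / (2.0 * t);
  const double im = std::copysign(t, b);
  const std::complex<double> lib = std::sqrt(std::complex<double>(a, b));
  std::printf("manual: (%g, %g)  std::sqrt: (%g, %g)\n", re, im,
              lib.real(), lib.imag());
  return 0;
}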
diff --git a/cupy/_core/include/cupy/complex/csqrtf.h b/cupy/_core/include/cupy/complex/csqrtf.h
new file mode 100644
index 0000000..8a5a7d4
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/csqrtf.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2007 David Schultz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *   freebsd/lib/msun/src/s_csqrt.c
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace thrust {
+namespace detail {
+namespace complex {
+
+using thrust::complex;
+
+__host__ __device__ inline complex<float> csqrtf(const complex<float>& z) {
+  float a = z.real(), b = z.imag();
+  float t;
+  int scale;
+  complex<float> result;
+
+  /* We risk spurious overflow for components >= FLT_MAX / (1 + sqrt(2)). */
+  const float THRESH = 1.40949553037932e+38f;
+
+  /* Handle special cases. */
+  if (z == 0.0f) return (complex<float>(0, b));
+  if (isinf(b)) return (complex<float>(infinity<float>(), b));
+  if (isnan(a)) {
+    t = (b - b) / (b - b);         /* raise invalid if b is not a NaN */
+    return (complex<float>(a, t)); /* return NaN + NaN i */
+  }
+  if (isinf(a)) {
+    /*
+     * csqrtf(inf + NaN i)  = inf + NaN i
+     * csqrtf(inf + y i)    = inf + 0 i
+     * csqrtf(-inf + NaN i) = NaN +- inf i
+     * csqrtf(-inf + y i)   = 0 + inf i
+     */
+    if (signbit(a))
+      return (complex<float>(fabsf(b - b), copysignf(a, b)));
+    else
+      return (complex<float>(a, copysignf(b - b, b)));
+  }
+  /*
+   * The remaining special case (b is NaN) is handled just fine by
+   * the normal code path below.
+   */
+
+  /*
+   * Unlike in the FreeBSD code we'll avoid using double precision as
+   * not all hardware supports it.
+   */
+
+  // FLT_MIN*2
+  const float low_thresh = 2.35098870164458e-38f;
+  scale = 0;
+
+  if (fabsf(a) >= THRESH || fabsf(b) >= THRESH) {
+    /* Scale to avoid overflow. */
+    a *= 0.25f;
+    b *= 0.25f;
+    scale = 1;
+  } else if (fabsf(a) <= low_thresh && fabsf(b) <= low_thresh) {
+    /* Scale to avoid underflow. */
+    a *= 4.f;
+    b *= 4.f;
+    scale = 2;
+  }
+
+  /* Algorithm 312, CACM vol 10, Oct 1967. */
+  if (a >= 0.0f) {
+    t = sqrtf((a + hypotf(a, b)) * 0.5f);
+    result = complex<float>(t, b / (2.0f * t));
+  } else {
+    t = sqrtf((-a + hypotf(a, b)) * 0.5f);
+    result = complex<float>(fabsf(b) / (2.0f * t), copysignf(t, b));
+  }
+
+  /* Rescale. */
+  if (scale == 1)
+    return (result * 2.0f);
+  else if (scale == 2)
+    return (result * 0.5f);
+  else
+    return (result);
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <>
+__host__ __device__ inline complex<float> sqrt(const complex<float>& z) {
+  return detail::complex::csqrtf(z);
+}
+
+} // namespace thrust
diff --git a/cupy/_core/include/cupy/complex/ctanh.h b/cupy/_core/include/cupy/complex/ctanh.h
new file mode 100644
index 0000000..4929f29
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/ctanh.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*-
+ * Copyright (c) 2011 David Schultz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Adapted from FreeBSD by Filipe Maia <filipe.c.maia@gmail.com>:
+ *   freebsd/lib/msun/src/s_ctanh.c
+ */
+
+/*
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ *   W. Kahan.  Branch Cuts for Complex Elementary Functions or Much
+ *   Ado About Nothing's Sign Bit.  In The State of the Art in
+ *   Numerical Analysis, pp. 165 ff.  Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ *   Let t    = tan(x)
+ *       beta = 1/cos^2(y)
+ *       s    = sinh(x)
+ *       rho  = cosh(x)
+ *
+ *   We have:
+ *
+ *   tanh(z) = sinh(z) / cosh(z)
+ *
+ *             sinh(x) cos(y) + i cosh(x) sin(y)
+ *           = ---------------------------------
+ *             cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ *             cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ *           = -------------------------------------
+ *                    1 + sinh^2(x) / cos^2(y)
+ *
+ *             beta rho s + i t
+ *           = ----------------
+ *               1 + beta s^2
+ *
+ * Modifications:
+ *
+ *   I omitted the original algorithm's handling of overflow in tan(x) after
+ *   verifying with nearpi.c that this can't happen in IEEE single or double
+ *   precision.  I also handle large x differently.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace thrust {
+namespace detail {
+namespace complex {
+
+using thrust::complex;
+
+__host__ __device__ inline complex<double> ctanh(const complex<double>& z) {
+  double x, y;
+  double t, beta, s, rho, denom;
+  uint32_t hx, ix, lx;
+
+  x = z.real();
+  y = z.imag();
+
+  extract_words(hx, lx, x);
+  ix = hx & 0x7fffffff;
+
+  /*
+   * ctanh(NaN + i 0) = NaN + i 0
+   *
+   * ctanh(NaN + i y) = NaN + i NaN        for y != 0
+   *
+   * The imaginary part has the sign of x*sin(2*y), but there's no
+   * special effort to get this right.
+   *
+   * ctanh(+-Inf +- i Inf) = +-1 +- 0
+   *
+   * ctanh(+-Inf + i y) = +-1 + 0 sin(2y)  for y finite
+   *
+   * The imaginary part of the sign is unspecified.  This special
+   * case is only needed to avoid a spurious invalid exception when
+   * y is infinite.
+   */
+  if (ix >= 0x7ff00000) {
+    if ((ix & 0xfffff) | lx) /* x is NaN */
+      return (complex<double>(x, (y == 0 ? y : x * y)));
+    set_high_word(x, hx - 0x40000000); /* x = copysign(1, x) */
+    return (complex<double>(x, copysign(0.0, isinf(y) ? y : sin(y) * cos(y))));
+  }
+
+  /*
+   * ctanh(x + i NAN) = NaN + i NaN
+   * ctanh(x +- i Inf) = NaN + i NaN
+   */
+  if (!isfinite(y)) return (complex<double>(y - y, y - y));
+
+  /*
+   * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+   * approximation sinh^2(huge) ~= exp(2*huge) / 4.
+   * We use a modified formula to avoid spurious overflow.
+   */
+  if (ix >= 0x40360000) { /* x >= 22 */
+    double exp_mx = exp(-fabs(x));
+    return (complex<double>(copysign(1.0, x),
+                            4.0 * sin(y) * cos(y) * exp_mx * exp_mx));
+  }
+
+  /* Kahan's algorithm */
+  t = tan(y);
+  beta = 1.0 + t * t; /* = 1 / cos^2(y) */
+  s = sinh(x);
+  rho = sqrt(1.0 + s * s); /* = cosh(x) */
+  denom = 1.0 + beta * s * s;
+  return (complex<double>((beta * rho * s) / denom, t / denom));
+}
+
+__host__ __device__ inline complex<double> ctan(complex<double> z) {
+  /* ctan(z) = -I * ctanh(I * z) */
+  z = ctanh(complex<double>(-z.imag(), z.real()));
+  return (complex<double>(z.imag(), -z.real()));
+}
+
+} // namespace complex
+
+} // namespace detail
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> tan(const complex<ValueType>& z) {
+  return sin(z) / cos(z);
+}
+
+template <typename ValueType>
+__host__ __device__ inline complex<ValueType> tanh(const complex<ValueType>& z) {
+  // This implementation seems better than the simple sin/cos
+  return (thrust::exp(ValueType(2) * z) - ValueType(1)) /
+         (thrust::exp(ValueType(2) * z) + ValueType(1));
+}
+
+template <>
+__host__ __device__ inline complex<double> tan(const complex<double>& z) {
+  return detail::complex::ctan(z);
+}
+
+template <>
+__host__ __device__ inline complex<double> tanh(const complex<double>& z) {
+  return detail::complex::ctanh(z);
+}
+
+} // namespace thrust
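A host-side check of the two paths in ctanh() above, using std::complex (illustrative sketch, not part of the CuPy sources): Kahan's rearrangement at moderate x, and the large-|x| shortcut that saturates the real part instead of forming sinh/cosh ratios that would overflow.

#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  // Kahan's algorithm, as in ctanh() above, for moderate x.
  const double x = 0.75, y = 0.5;
  const double t = std::tan(y);
  const double beta = 1.0 + t * t;            // 1 / cos^2(y)
  const double s = std::sinh(x);
  const double rho = std::sqrt(1.0 + s * s);  // cosh(x)
  const double denom = 1.0 + beta * s * s;
  const std::complex<double> lib = std::tanh(std::complex<double>(x, y));
  std::printf("kahan: (%f, %f)  std: (%f, %f)\n",
              (beta * rho * s) / denom, t / denom, lib.real(), lib.imag());
  // Large-|x| shortcut: real part saturates to +-1 and the imaginary part
  // decays like 4*sin(y)*cos(y)*exp(-2|x|), avoiding overflow.
  const double X = 30.0;
  const double exp_mx = std::exp(-std::fabs(X));
  std::printf("large: (%g, %g)\n", std::copysign(1.0, X),
              4.0 * std::sin(y) * std::cos(y) * exp_mx * exp_mx);
  return 0;
}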
+ */ + if (ix >= 0x40360000) { /* x >= 22 */ + double exp_mx = exp(-fabs(x)); + return (complex(copysign(1.0, x), + 4.0 * sin(y) * cos(y) * exp_mx * exp_mx)); + } + + /* Kahan's algorithm */ + t = tan(y); + beta = 1.0 + t * t; /* = 1 / cos^2(y) */ + s = sinh(x); + rho = sqrt(1.0 + s * s); /* = cosh(x) */ + denom = 1.0 + beta * s * s; + return (complex((beta * rho * s) / denom, t / denom)); +} + +__host__ __device__ inline complex ctan(complex z) { + /* ctan(z) = -I * ctanh(I * z) */ + z = ctanh(complex(-z.imag(), z.real())); + return (complex(z.imag(), -z.real())); +} + +} // namespace complex + +} // namespace detail + +template +__host__ __device__ inline complex tan(const complex& z) { + return sin(z) / cos(z); +} + +template +__host__ __device__ inline complex tanh(const complex& z) { + // This implementation seems better than the simple sin/cos + return (thrust::exp(ValueType(2) * z) - ValueType(1)) / + (thrust::exp(ValueType(2) * z) + ValueType(1)); +} + +template <> +__host__ __device__ inline complex tan(const complex& z) { + return detail::complex::ctan(z); +} + +template <> +__host__ __device__ inline complex tanh(const complex& z) { + return detail::complex::ctanh(z); +} + +} // namespace thrust diff --git a/cupy/_core/include/cupy/complex/ctanhf.h b/cupy/_core/include/cupy/complex/ctanhf.h new file mode 100644 index 0000000..8a0acc7 --- /dev/null +++ b/cupy/_core/include/cupy/complex/ctanhf.h @@ -0,0 +1,116 @@ +/* + * Copyright 2008-2013 NVIDIA Corporation + * Copyright 2013 Filipe RNC Maia + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*- + * Copyright (c) 2011 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/cupy/_core/include/cupy/complex/math_private.h b/cupy/_core/include/cupy/complex/math_private.h
new file mode 100644
index 0000000..7784ca1
--- /dev/null
+++ b/cupy/_core/include/cupy/complex/math_private.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2008-2013 NVIDIA Corporation
+ * Copyright 2013 Filipe RNC Maia
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ==================================================== + */ + +/* adapted from FreeBSD: + * lib/msun/src/math_private.h + */ + +#pragma once + +#if defined(_MSC_VER) // see #6823 +#include +#endif // defined(_MSC_VER) + +namespace thrust { + +#if !defined(_MSC_VER) // see #6823 +const float FLT_MIN = 1.17549435e-38F; +const float FLT_MAX = 3.40282347e+38F; +const float FLT_EPSILON = 1.19209290e-07F; +const int FLT_MAX_EXP = 128; +const int FLT_MANT_DIG = 24; + +const double DBL_MIN = 2.2250738585072014e-308; +const double DBL_MAX = 1.7976931348623157e+308; +const double DBL_EPSILON = 2.2204460492503131e-16; +const int DBL_MAX_EXP = 1024; +const int DBL_MANT_DIG = 53; +#endif // !defined(_MSC_VER) + +namespace detail { +namespace complex { + +typedef int int32_t; +typedef unsigned int uint32_t; +typedef long long int64_t; +typedef unsigned long long uint64_t; + +typedef union { + float value; + uint32_t word; +} ieee_float_shape_type; + +__host__ __device__ inline void get_float_word(uint32_t& i, float d) { + ieee_float_shape_type gf_u; + gf_u.value = (d); + (i) = gf_u.word; +} + +__host__ __device__ inline void get_float_word(int32_t& i, float d) { + ieee_float_shape_type gf_u; + gf_u.value = (d); + (i) = gf_u.word; +} + +__host__ __device__ inline void set_float_word(float& d, uint32_t i) { + ieee_float_shape_type sf_u; + sf_u.word = (i); + (d) = sf_u.value; +} + +// Assumes little endian ordering +typedef union { + double value; + struct { + uint32_t lsw; + uint32_t msw; + } parts; + struct { + uint64_t w; + } xparts; +} ieee_double_shape_type; + +__host__ __device__ inline void get_high_word(uint32_t& i, double d) { + ieee_double_shape_type gh_u; + gh_u.value = (d); + (i) = gh_u.parts.msw; +} + +/* Set the more significant 32 bits of a double from an int. */ +__host__ __device__ inline void set_high_word(double& d, uint32_t v) { + ieee_double_shape_type sh_u; + sh_u.value = (d); + sh_u.parts.msw = (v); + (d) = sh_u.value; +} + +__host__ __device__ inline void insert_words(double& d, uint32_t ix0, uint32_t ix1) { + ieee_double_shape_type iw_u; + iw_u.parts.msw = (ix0); + iw_u.parts.lsw = (ix1); + (d) = iw_u.value; +} + +/* Get two 32 bit ints from a double. */ +__host__ __device__ inline void extract_words(uint32_t& ix0, uint32_t& ix1, double d) { + ieee_double_shape_type ew_u; + ew_u.value = (d); + (ix0) = ew_u.parts.msw; + (ix1) = ew_u.parts.lsw; +} + +/* Get two 32 bit ints from a double. 
 */
+__host__ __device__ inline void extract_words(int32_t& ix0, int32_t& ix1, double d) {
+  ieee_double_shape_type ew_u;
+  ew_u.value = (d);
+  (ix0) = ew_u.parts.msw;
+  (ix1) = ew_u.parts.lsw;
+}
+
+template <typename T>
+inline __host__ __device__ T infinity();
+
+template <>
+inline __host__ __device__ float infinity<float>() {
+  float res;
+  set_float_word(res, 0x7f800000);
+  return res;
+}
+
+template <>
+inline __host__ __device__ double infinity<double>() {
+  double res;
+  insert_words(res, 0x7ff00000, 0);
+  return res;
+}
+
+using ::abs;
+using ::log;
+using ::acos;
+using ::asin;
+using ::sqrt;
+using ::sinh;
+using ::tan;
+using ::cos;
+using ::sin;
+using ::exp;
+using ::cosh;
+using ::atan;
+using ::atanh;
+using ::isinf;
+using ::isnan;
+using ::signbit;
+using ::isfinite;
+
+} // namespace complex
+
+} // namespace detail
+
+using ::abs;
+using ::log;
+using ::acos;
+using ::asin;
+using ::sqrt;
+using ::sinh;
+using ::tan;
+using ::cos;
+using ::sin;
+using ::exp;
+using ::cosh;
+using ::atan;
+using ::atanh;
+using ::isinf;
+using ::isnan;
+using ::signbit;
+using ::isfinite;
+
+} // namespace thrust
diff --git a/cupy/_core/include/cupy/cuComplex_bridge.h b/cupy/_core/include/cupy/cuComplex_bridge.h
new file mode 100644
index 0000000..6e57276
--- /dev/null
+++ b/cupy/_core/include/cupy/cuComplex_bridge.h
@@ -0,0 +1,34 @@
+/*
+    This header is to support the "translate_cucomplex" option
+    that turns cuComplex function calls to their Thrust counterparts.
+*/
+
+/* ------------------- single complex ------------------- */
+#define cuFloatComplex complex<float>
+#define cuComplex complex<float>
+#define cuCrealf real
+#define cuCimagf imag
+#define make_cuFloatComplex(A, B) complex<float>(A, B)
+#define make_cuComplex(A, B) complex<float>(A, B)
+#define cuConjf conj
+#define cuCaddf(A, B) (A + B)
+#define cuCsubf(A, B) (A - B)
+#define cuCmulf(A, B) (A * B)
+#define cuCdivf(A, B) (A / B)
+#define cuCabsf abs
+#define cuComplexDoubleToFloat complex<float>
+#define cuCfmaf(A, B, C) (A * B + C)
+
+/* ------------------- double complex ------------------- */
+#define cuDoubleComplex complex<double>
+#define cuCreal real
+#define cuCimag imag
+#define make_cuDoubleComplex(A, B) complex<double>(A, B)
+#define cuConj conj
+#define cuCadd(A, B) (A + B)
+#define cuCsub(A, B) (A - B)
+#define cuCmul(A, B) (A * B)
+#define cuCdiv(A, B) (A / B)
+#define cuCabs abs
+#define cuComplexFloatToDouble complex<double>
+#define cuCfma(A, B, C) (A * B + C)
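Usage note: with these macros in effect (CuPy's translate_cucomplex option), kernel code written against the cuComplex API compiles directly against thrust::complex, since the arithmetic macros expand to plain operators. A hypothetical kernel source (illustrative sketch; the kernel name and signature are assumptions, not part of the patch):

// With translate_cucomplex enabled, cuComplex below expands to
// complex<float> and cuCmulf(a, b) expands to (a * b).
extern "C" __global__
void scale(const cuComplex* in, cuComplex s, cuComplex* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = cuCmulf(in[i], s);
  }
}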
diff --git a/cupy/_core/include/cupy/cub/.gitattributes b/cupy/_core/include/cupy/cub/.gitattributes
new file mode 100644
index 0000000..c6a031b
--- /dev/null
+++ b/cupy/_core/include/cupy/cub/.gitattributes
@@ -0,0 +1 @@
+cub symlink=dir
diff --git a/cupy/_core/include/cupy/cub/LICENSE.TXT b/cupy/_core/include/cupy/cub/LICENSE.TXT
new file mode 120000
index 0000000..d7c89da
--- /dev/null
+++ b/cupy/_core/include/cupy/cub/LICENSE.TXT
@@ -0,0 +1 @@
+../../../../../third_party/cub/LICENSE.TXT
\ No newline at end of file
diff --git a/cupy/_core/include/cupy/cub/cub b/cupy/_core/include/cupy/cub/cub
new file mode 120000
index 0000000..12d0d58
--- /dev/null
+++ b/cupy/_core/include/cupy/cub/cub
@@ -0,0 +1 @@
+../../../../../third_party/cub/cub
\ No newline at end of file
diff --git a/cupy/_core/include/cupy/cuda_workaround.h b/cupy/_core/include/cupy/cuda_workaround.h
new file mode 100644
index 0000000..74c2117
--- /dev/null
+++ b/cupy/_core/include/cupy/cuda_workaround.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#ifdef __CUDACC_RTC__
+// cudaDeviceSynchronize() is no longer supported by nvrtc in device code on
+// H100 GPUs or any GPUs in CUDA 12.x. cudaDeviceSynchronize() is used in the
+// CUB bundled with CuPy, resulting in a compilation error when the GPU is
+// H100 or later, or the CUDA version is 12 or later.
+#if __CUDA_ARCH__ >= 900
+cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }
+#elif __CUDACC_VER_MAJOR__ >= 12
+cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }
+#endif
+#endif
diff --git a/cupy/_core/include/cupy/dlpack/README.md b/cupy/_core/include/cupy/dlpack/README.md
new file mode 100644
index 0000000..6d50eff
--- /dev/null
+++ b/cupy/_core/include/cupy/dlpack/README.md
@@ -0,0 +1,4 @@
+## DLPack header
+
+The header `dlpack.h` is downloaded from https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h.
+The commit is [`365b823`](https://github.com/dmlc/dlpack/commit/365b823cedb281cd0240ca601aba9b78771f91a3).
diff --git a/cupy/_core/include/cupy/dlpack/dlpack.h b/cupy/_core/include/cupy/dlpack/dlpack.h
new file mode 100644
index 0000000..6d51801
--- /dev/null
+++ b/cupy/_core/include/cupy/dlpack/dlpack.h
@@ -0,0 +1,232 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 80
+
+/*! \brief The current ABI version of dlpack */
+#define DLPACK_ABI_VERSION 1
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
+typedef enum {
+#endif
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used for quickly test extension device
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partitioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   *
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
+ *
+ * Examples
+ *  - float: type_code = 2, bits = 32, lanes = 1
+ *  - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *  - int8: type_code = 0, bits = 8, lanes = 1
+ *  - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *  - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer*/
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   * can be NULL, indicating tensor is compact and row-majored.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ * intended to facilitate the borrowing of DLTensor by another framework. It is
+ * not meant to transfer the tensor. When the borrowing framework doesn't need
+ * the tensor, it should call the deleter to notify the host that the resource
+ * is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *  which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*! \brief Destructor signature void (*)(void*) - this should be called
+   *  to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+   *  if there is no way for the caller to provide a reasonable destructor.
+   *  The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+#ifdef __cplusplus
+} // DLPACK_EXTERN_C
+#endif
+#endif // DLPACK_DLPACK_H_
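Consumer-side sketch of the DLManagedTensor protocol described above (illustrative only; the function name is an assumption): a borrowing framework reads dl_tensor, and when done it calls deleter exactly once, if non-NULL, and must not touch the struct afterwards.

#include <cstdio>

static void consume(DLManagedTensor* mt) {
  const DLTensor* t = &mt->dl_tensor;
  long long elems = 1;
  for (int32_t i = 0; i < t->ndim; ++i) elems *= t->shape[i];
  // e.g. a float32 tensor carries dtype = {kDLFloat, 32, 1};
  // total bytes = elems * bits * lanes / 8, as in GetDataSize() above.
  std::printf("ndim=%d elems=%lld bits=%u lanes=%u\n", t->ndim, elems,
              (unsigned)t->dtype.bits, (unsigned)t->dtype.lanes);
  if (mt->deleter) mt->deleter(mt);  // hand ownership back to the producer
}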
diff --git a/cupy/_core/include/cupy/hip_workaround.cuh b/cupy/_core/include/cupy/hip_workaround.cuh
new file mode 100644
index 0000000..883fb10
--- /dev/null
+++ b/cupy/_core/include/cupy/hip_workaround.cuh
@@ -0,0 +1,20 @@
+#ifndef INCLUDE_GUARD_CUPY_HIP_WORKAROUND_H
+#define INCLUDE_GUARD_CUPY_HIP_WORKAROUND_H
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+// ignore mask
+#define __shfl_sync(mask, ...) __shfl(__VA_ARGS__)
+#define __shfl_up_sync(mask, ...) __shfl_up(__VA_ARGS__)
+#define __shfl_down_sync(mask, ...) __shfl_down(__VA_ARGS__)
+#define __shfl_xor_sync(mask, ...) __shfl_xor(__VA_ARGS__)
+
+// In ROCm, threads in a warp march in lock-step, so we don't need to
+// synchronize the threads. But it doesn't guarantee the memory order,
+// which still makes memory fences necessary.
+// https://rocmdocs.amd.com/en/latest/Programming_Guides/Kernel_language.html#warp-cross-lane-functions
+#define __syncwarp() { __threadfence_block(); }
+
+#endif // __HIP_DEVICE_COMPILE__
+
+#endif // INCLUDE_GUARD_CUPY_HIP_WORKAROUND_H
diff --git a/cupy/_core/include/cupy/jitify/.clang-format b/cupy/_core/include/cupy/jitify/.clang-format
new file mode 100644
index 0000000..4a88069
--- /dev/null
+++ b/cupy/_core/include/cupy/jitify/.clang-format
@@ -0,0 +1,149 @@
+---
+Language: Cpp
+# BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: false
+  BeforeElse: false
+  IndentBraces: false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:' +CommentPragmas: '^\\.+' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^' + Priority: 2 + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +TabWidth: 8 +UseTab: Never +... + diff --git a/cupy/_core/include/cupy/jitify/.gitignore b/cupy/_core/include/cupy/jitify/.gitignore new file mode 100644 index 0000000..c3a73d4 --- /dev/null +++ b/cupy/_core/include/cupy/jitify/.gitignore @@ -0,0 +1,10 @@ + +jitify_example +stringify +*.jit +*.o +jitify_2nd_compilation_unit.cpp +jitify_test +googletest/ +# Backup files +*~ diff --git a/cupy/_core/include/cupy/jitify/Doxyfile b/cupy/_core/include/cupy/jitify/Doxyfile new file mode 100644 index 0000000..583bc0c --- /dev/null +++ b/cupy/_core/include/cupy/jitify/Doxyfile @@ -0,0 +1,2427 @@ +# Doxyfile 1.8.11 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = Jitify + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = doc + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. 
If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 8 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. 
+# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the latter case the parser tries to guess whether the code is fixed +# or free formatted code; this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING =
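+# For example, to have doxygen parse CUDA sources such as .cu and .cuh files +# as C++, a mapping like the commented-out one below could be enabled (note +# that custom extensions also need a matching FILE_PATTERNS entry, as described +# above): +# +# EXTENSION_MAPPING = cu=C++ cuh=C++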
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibility issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When the TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope.
Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE or EXTRACT_STATIC tags, respectively, are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespaces +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO.
+ +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = YES + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. 
If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. 
+# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = jitify.hpp + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, +# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. 
+ +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter).
See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. 
+# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen's built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = NO + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX =
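+# For example, if most classes in a project shared a common prefix such as the +# hypothetical Cupy, the commented-out setting below would list CupyArray under +# the letter A instead of C in the index: +# +# IGNORE_PREFIX = Cupy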
+ +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output. +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML you need a header file that includes any scripts and style +# sheets that doxygen needs, which depend on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO
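+# As a worked example of the three HTML_COLORSTYLE_* tags above (the values +# below are illustrative and commented out, not the ones set in this file): a +# hue of 0 selects red, a saturation of 255 gives the most vivid colors, and a +# gamma value of 220 applies an actual gamma of 220/100 = 2.2: +# +# HTML_COLORSTYLE_HUE = 0 +# HTML_COLORSTYLE_SAT = 255 +# HTML_COLORSTYLE_GAMMA = 220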
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries to 1 will produce a fully collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a fully expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). 
+# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. 
+# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes take effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
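+# For instance, assuming a local copy of MathJax were unpacked in a directory +# named mathjax placed next to the HTML output directory, the CDN default above +# could be replaced by: +# +# MATHJAX_RELPATH = ../mathjax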
+ +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow; in that +# case enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use <access key> + S +# (what the <access key> is depends on the OS and browser, but it is typically +# <CTRL>, <ALT>/