/******************************************************************************
 * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
 *  Modifications Copyright© 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
#pragma once

#include <cstdio>
#include <thrust/detail/config.h>
#include <thrust/iterator/iterator_traits.h>
// Not present in rocPRIM
#include <thrust/system/cuda/detail/execution_policy.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>

// Compile-time switch for device-side diagnostics: define
// THRUST_HIP_PRINTF_ENABLED to 0 before this point to disable printf on the
// device side. When it is not predefined it defaults to 1 (enabled).
#ifndef THRUST_HIP_PRINTF_ENABLED
#define THRUST_HIP_PRINTF_ENABLED 1
#endif

// THRUST_HIP_PRINTF forwards its arguments to printf when the switch above is
// exactly 1. For any other value the macro expands to nothing, so its
// arguments are not evaluated at all.
#if THRUST_HIP_PRINTF_ENABLED == 1
  #define THRUST_HIP_PRINTF(text, ...) \
    printf(text, ##__VA_ARGS__)
#else
  #define THRUST_HIP_PRINTF(text, ...)
#endif

THRUST_NAMESPACE_BEGIN
namespace cuda_cub
{

// Returns the stream used when nothing more specific is available.
// The choice is made at compile time: cudaStreamPerThread when
// CUDA_API_PER_THREAD_DEFAULT_STREAM is defined, cudaStreamDefault otherwise.
inline __host__ __device__ cudaStream_t default_stream()
{
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
    cudaStream_t const selected = cudaStreamPerThread;
#else
    // There is no cudaStreamLegacy to return here, so cudaStreamDefault is used.
    cudaStream_t const selected = cudaStreamDefault;
#endif
    return selected;
}

// Stream lookup for a generic execution_policy: the policy argument is not
// inspected and the result of default_stream() is returned.
template <class Derived>
__host__ __device__ cudaStream_t get_stream(execution_policy<Derived>& /*policy*/)
{
    cudaStream_t const fallback = default_stream();
    return fallback;
}

// Fallback implementation of the customization point: for a generic
// execution_policy the optional stream synchronization is always requested.
template <class Derived>
__host__ __device__ bool
must_perform_optional_stream_synchronization(execution_policy<Derived>& /*policy*/)
{
  bool const always_synchronize = true;
  return always_synchronize;
}

// Entry point/interface: asks the
// must_perform_optional_stream_synchronization customization point, invoked
// on derived_cast(policy), whether the optional synchronization is required.
template <class Derived>
__host__ __device__ bool
must_perform_optional_synchronization(execution_policy<Derived>& policy)
{
  bool const sync_required
    = must_perform_optional_stream_synchronization(derived_cast(policy));
  return sync_required;
}

// Synchronizes with the stream associated with `policy` and reports the
// resulting error state.
//
// Host code:   waits on stream(policy) with cudaStreamSynchronize and returns
//              the value of cudaGetLastError().
// Device code: when __THRUST_HAS_HIPRT__ is set, calls cudaDeviceSynchronize
//              and returns cudaGetLastError(); otherwise nothing is done and
//              cudaSuccess is returned.
//
// The function is annotated __host__ __device__ because it contains a device
// branch and is called from the __host__ __device__ synchronize() entry point.
template <class Derived>
__host__ __device__ cudaError_t synchronize_stream(execution_policy<Derived>& policy)
{
  // Start from cudaSuccess so the returned value is never read uninitialized
  // when one of the branches below is compiled out.
  cudaError_t result = cudaSuccess;
  if (THRUST_IS_HOST_CODE) {
    #if THRUST_INCLUDE_HOST_CODE
      cudaStreamSynchronize(stream(policy));
      result = cudaGetLastError();
    #endif
  } else {
    #if THRUST_INCLUDE_DEVICE_CODE
      #if __THRUST_HAS_HIPRT__
        THRUST_UNUSED_VAR(policy);
        cudaDeviceSynchronize();
        result = cudaGetLastError();
      #else
        THRUST_UNUSED_VAR(policy);
        result = cudaSuccess;
      #endif
    #endif
  }
  return result;
}

// Fallback implementation of the customization point.
// Performs the same synchronization steps as a regular stream
// synchronization, but only when must_perform_optional_synchronization(policy)
// answers true; when it answers false the function returns cudaSuccess
// without synchronizing.
template <class Derived>
__host__ __device__ cudaError_t
synchronize_stream_optional(execution_policy<Derived>& policy)
{
  cudaError_t result;
  if (THRUST_IS_HOST_CODE) {
    #if THRUST_INCLUDE_HOST_CODE
      if (!must_perform_optional_synchronization(policy)) {
        // Synchronization was declined: nothing to wait for.
        result = cudaSuccess;
      } else {
        cudaStreamSynchronize(stream(policy));
        result = cudaGetLastError();
      }
    #endif
  } else {
    #if THRUST_INCLUDE_DEVICE_CODE
      #if __THRUST_HAS_HIPRT__
        if (!must_perform_optional_synchronization(policy)) {
          result = cudaSuccess;
        } else {
          cudaDeviceSynchronize();
          result = cudaGetLastError();
        }
      #else
        // No runtime available in device code: report success.
        THRUST_UNUSED_VAR(policy);
        result = cudaSuccess;
      #endif
    #endif
  }
  return result;
}

// Entry point/interface: dispatches to the synchronize_stream_optional
// customization point on derived_cast(policy) and returns its status.
template <class Policy>
__host__ __device__ cudaError_t synchronize_optional(Policy& policy)
{
  cudaError_t const status = synchronize_stream_optional(derived_cast(policy));
  return status;
}

// Synchronization entry point for a policy.
// With __THRUST_HAS_HIPRT__ the call is forwarded to synchronize_stream on
// derived_cast(policy); without it the policy is ignored and cudaSuccess is
// returned.
__thrust_exec_check_disable__ template <class Policy>
__host__ __device__ cudaError_t synchronize(Policy& policy)
{
    cudaError_t outcome = cudaSuccess;
#if __THRUST_HAS_HIPRT__
    outcome = synchronize_stream(derived_cast(policy));
#else
    THRUST_UNUSED_VAR(policy);
#endif
    return outcome;
}

// Returns the stream for `policy` by calling get_stream on the derived policy
// obtained through derived_cast.
template <class Derived>
__host__ __device__ cudaStream_t stream(execution_policy<Derived>& policy)
{
    cudaStream_t const policy_stream = get_stream(derived_cast(policy));
    return policy_stream;
}

// Copies `count` objects of Type from `src` to `dst` using a
// cudaMemcpyDeviceToHost transfer enqueued on `stream`, then waits for the
// stream to finish.
//
// Returns cudaSuccess immediately when `count` is zero. Otherwise returns the
// copy's error if enqueuing failed, or else the result of the stream
// synchronization.
template <class Type>
cudaError_t THRUST_HIP_HOST_FUNCTION
trivial_copy_from_device(Type* dst, Type const* src, size_t count, cudaStream_t stream)
{
    if(count == 0)
    {
        return cudaSuccess;
    }

    // An asynchronous copy followed by an explicit synchronization is used;
    // cudaMemcpyWithStream is only supported on rocm 3.1 and above.
    size_t const num_bytes = sizeof(Type) * count;
    cudaError_t const copy_status
        = ::cudaMemcpyAsync(dst, src, num_bytes, cudaMemcpyDeviceToHost, stream);

    if(copy_status == cudaSuccess)
    {
        return cudaStreamSynchronize(stream);
    }
    return copy_status;
}

// Copies `count` objects of Type from `src` to `dst` using a
// cudaMemcpyHostToDevice transfer enqueued on `stream`, then waits for the
// stream to finish.
//
// Returns cudaSuccess immediately when `count` is zero. Otherwise returns the
// copy's error if enqueuing failed, or else the result of the stream
// synchronization.
template <class Type>
cudaError_t THRUST_HIP_HOST_FUNCTION
trivial_copy_to_device(Type* dst, Type const* src, size_t count, cudaStream_t stream)
{
    if(count == 0)
    {
        return cudaSuccess;
    }

    // An asynchronous copy followed by an explicit synchronization is used;
    // cudaMemcpyWithStream is only supported on rocm 3.1 and above.
    size_t const num_bytes = sizeof(Type) * count;
    cudaError_t const copy_status
        = ::cudaMemcpyAsync(dst, src, num_bytes, cudaMemcpyHostToDevice, stream);

    if(copy_status == cudaSuccess)
    {
        return cudaStreamSynchronize(stream);
    }
    return copy_status;
}

// Copies `count` objects of Type from `src` to `dst` with a
// cudaMemcpyDeviceToDevice transfer enqueued on the stream of `policy`, then
// synchronizes through cuda_cub::synchronize(policy).
//
// Returns cudaSuccess immediately when `count` is zero. Otherwise returns the
// copy's error if enqueuing failed; if the copy was enqueued successfully the
// status of the synchronization is returned, so a failure detected while
// waiting is no longer silently dropped.
template <class Policy, class Type>
__host__ __device__ cudaError_t
trivial_copy_device_to_device(Policy& policy, Type* dst, Type const* src, size_t count)
{
    cudaError_t status = cudaSuccess;
    if(count == 0)
        return status;

    cudaStream_t stream = cuda_cub::stream(policy);
    status = ::cudaMemcpyAsync(dst, src, sizeof(Type) * count, cudaMemcpyDeviceToDevice, stream);

    // The synchronization is still performed unconditionally; its result only
    // replaces `status` when the copy itself reported success, so the first
    // error encountered is the one reported.
    cudaError_t const sync_status = cuda_cub::synchronize(policy);
    if(status == cudaSuccess)
        status = sync_status;
    return status;
}

// Stops execution: calls abort() in device code and std::terminate() in host
// code. Each call is only compiled in when the matching
// THRUST_INCLUDE_DEVICE_CODE / THRUST_INCLUDE_HOST_CODE switch is set.
inline void __host__ __device__ terminate()
{
  if (THRUST_IS_DEVICE_CODE) {
    #if THRUST_INCLUDE_DEVICE_CODE
      abort();
    #endif
    return;
  }

  #if THRUST_INCLUDE_HOST_CODE
    std::terminate();
  #endif
}

// Reports a failed status together with a caller-supplied message.
//
// Nothing happens when `status` is cudaSuccess (apart from clearing the
// runtime's last-error state when __THRUST_HAS_HIPRT__ is set). Otherwise:
//   - host code throws thrust::system_error built from `status`,
//     thrust::cuda_category() and `msg`;
//   - device code prints a diagnostic and calls cuda_cub::terminate().
inline void __host__ __device__ throw_on_error(cudaError_t status, char const* msg)
{
#if __THRUST_HAS_HIPRT__
  // Clear the global HIP error state which may have been set by the last
  // call. Otherwise, errors may "leak" to unrelated kernel launches.
  cudaError_t stale_error = cudaGetLastError();
  THRUST_UNUSED_VAR(stale_error);
#endif

  // Guard clause: a successful status needs no reporting.
  if(status == cudaSuccess)
  {
    return;
  }

  if (THRUST_IS_HOST_CODE) {
    #if THRUST_INCLUDE_HOST_CODE
      throw thrust::system_error(status, thrust::cuda_category(), msg);
    #endif
  } else {
    #if THRUST_INCLUDE_DEVICE_CODE
      #if __THRUST_HAS_HIPRT__
        printf("Thrust HIP Backend Error %s: %s\n", cudaGetErrorString(status), msg);
      #else
        THRUST_HIP_PRINTF("Error %d :%s \n", static_cast<int>(status), msg);
        #if THRUST_HIP_PRINTF_ENABLED == 0
          // The macro above expands to nothing, so mark its arguments as used.
          THRUST_UNUSED_VAR(status);
          THRUST_UNUSED_VAR(msg);
        #endif
      #endif
      cuda_cub::terminate();
    #endif
  }
}

// TODO this overload should be removed and messages should be passed.
//
// Message-less variant of the error reporter.
// Nothing happens when `status` is cudaSuccess (apart from clearing the
// runtime's last-error state when __THRUST_HAS_HIPRT__ is set). Otherwise:
//   - host code throws thrust::system_error built from `status` and
//     thrust::cuda_category();
//   - device code prints a diagnostic and calls cuda_cub::terminate().
inline void __host__ __device__ throw_on_error(cudaError_t status)
{
#if __THRUST_HAS_HIPRT__
  // Clear the global HIP error state which may have been set by the last
  // call. Otherwise, errors may "leak" to unrelated kernel launches.
  cudaError_t stale_error = cudaGetLastError();
  THRUST_UNUSED_VAR(stale_error);
#endif

  // Guard clause: a successful status needs no reporting.
  if(status == cudaSuccess)
  {
    return;
  }

  if (THRUST_IS_HOST_CODE) {
    #if THRUST_INCLUDE_HOST_CODE
      throw thrust::system_error(status, thrust::cuda_category());
    #endif
  } else {
    #if THRUST_INCLUDE_DEVICE_CODE
      #if __THRUST_HAS_HIPRT__
        printf("Thrust HIP Backend Error %s\n", cudaGetErrorString(status));
      #else
        THRUST_HIP_PRINTF("Error %d \n", static_cast<int>(status));
        #if THRUST_HIP_PRINTF_ENABLED == 0
          // The macro above expands to nothing, so mark its argument as used.
          THRUST_UNUSED_VAR(status);
        #endif
      #endif
      cuda_cub::terminate();
    #endif
  }
}

// Iterator adaptor that wraps an input iterator and a unary functor.
// Dereferencing reads a value from the wrapped iterator and returns the result
// of applying the functor to it. `reference` is an alias of `value_type`, so
// the result is returned by value. All positional operations (increment,
// offset, distance, comparison) act on the wrapped iterator only.
template <class ValueType, class InputIt, class UnaryOp>
struct transform_input_iterator_t
{
    typedef transform_input_iterator_t                         self_t;
    typedef typename iterator_traits<InputIt>::difference_type difference_type;
    typedef ValueType                                          value_type;
    typedef void                                               pointer;
    typedef value_type                                         reference;
    typedef std::random_access_iterator_tag                    iterator_category;

    InputIt         input; // wrapped iterator
    mutable UnaryOp op;    // functor; mutable so const members can invoke it

    /// Builds the adaptor from an iterator and the functor to apply.
    THRUST_HIP_FUNCTION transform_input_iterator_t(InputIt input_, UnaryOp op_)
        : input(input_)
        , op(op_)
    {
    }

#if THRUST_CPP_DIALECT >= 2011
    transform_input_iterator_t(const self_t&) = default;
#endif

    // UnaryOp might not be copy assignable, such as when it is a lambda.
    // The explicit copy assignment therefore copies only the wrapped iterator
    // and leaves the functor untouched.
    THRUST_HIP_FUNCTION self_t& operator=(const self_t& rhs)
    {
        input = rhs.input;
        return *this;
    }

    /// Postfix increment: advances and returns the previous position.
    THRUST_HIP_FUNCTION self_t operator++(int)
    {
        self_t previous = *this;
        ++input;
        return previous;
    }

    /// Prefix increment: advances and returns a copy of the advanced iterator.
    THRUST_HIP_FUNCTION self_t operator++()
    {
        ++input;
        return *this;
    }

    /// Indirection (const): copies the current input value, then transforms it.
    THRUST_HIP_FUNCTION reference operator*() const
    {
        typedef typename thrust::iterator_value<InputIt>::type input_value_t;
        input_value_t loaded = *input;
        return op(loaded);
    }

    /// Indirection (non-const): same behavior as the const overload.
    THRUST_HIP_FUNCTION reference operator*()
    {
        typedef typename thrust::iterator_value<InputIt>::type input_value_t;
        input_value_t loaded = *input;
        return op(loaded);
    }

    /// Addition: adaptor positioned `n` steps ahead, sharing the same functor.
    THRUST_HIP_FUNCTION self_t operator+(difference_type n) const
    {
        return self_t(input + n, op);
    }

    /// Addition assignment
    THRUST_HIP_FUNCTION self_t& operator+=(difference_type n)
    {
        input += n;
        return *this;
    }

    /// Subtraction: adaptor positioned `n` steps back, sharing the same functor.
    THRUST_HIP_FUNCTION self_t operator-(difference_type n) const
    {
        return self_t(input - n, op);
    }

    /// Subtraction assignment
    THRUST_HIP_FUNCTION self_t& operator-=(difference_type n)
    {
        input -= n;
        return *this;
    }

    /// Distance between the wrapped iterators.
    THRUST_HIP_FUNCTION difference_type operator-(self_t rhs) const
    {
        return input - rhs.input;
    }

    /// Array subscript: applies the functor to the element at offset `n`.
    THRUST_HIP_FUNCTION reference operator[](difference_type n) const
    {
        return op(input[n]);
    }

    /// Equal to: compares the wrapped iterators only.
    THRUST_HIP_FUNCTION bool operator==(const self_t& rhs) const
    {
        return input == rhs.input;
    }

    /// Not equal to: compares the wrapped iterators only.
    THRUST_HIP_FUNCTION bool operator!=(const self_t& rhs) const
    {
        return input != rhs.input;
    }

}; // struct transform_input_iterator_t

// Iterator adaptor that walks two input iterators in lock-step and combines
// the values they point at with a binary functor. `reference` is an alias of
// `value_type`, so dereferencing returns the functor's result by value.
// Positional operations advance/offset both wrapped iterators together; the
// distance between two adaptors is measured on the first wrapped iterator.
template <class ValueType, class InputIt1, class InputIt2, class BinaryOp>
struct transform_pair_of_input_iterators_t
{
    typedef transform_pair_of_input_iterators_t                 self_t;
    typedef typename iterator_traits<InputIt1>::difference_type difference_type;
    typedef ValueType                                           value_type;
    typedef void                                                pointer;
    typedef value_type                                          reference;
    typedef std::random_access_iterator_tag                     iterator_category;

    InputIt1         input1; // first wrapped iterator
    InputIt2         input2; // second wrapped iterator
    mutable BinaryOp op;     // functor; mutable so const members can invoke it

    /// Builds the adaptor from both iterators and the functor to apply.
    THRUST_HIP_FUNCTION transform_pair_of_input_iterators_t(InputIt1 input1_,
                                                            InputIt2 input2_,
                                                            BinaryOp op_)
        : input1(input1_)
        , input2(input2_)
        , op(op_)
    {
    }

#if THRUST_CPP_DIALECT >= 2011
    transform_pair_of_input_iterators_t(const self_t&) = default;
#endif

    // BinaryOp might not be copy assignable, such as when it is a lambda.
    // Define an explicit copy assignment operator that doesn't try to assign it.
    // It carries THRUST_HIP_FUNCTION like every other member (and like the
    // assignment operator of transform_input_iterator_t) so that it is
    // annotated consistently with the rest of the interface.
    THRUST_HIP_FUNCTION self_t& operator=(const self_t& o)
    {
        input1 = o.input1;
        input2 = o.input2;
        return *this;
    }

    /// Postfix increment: advances both iterators, returns the previous position.
    THRUST_HIP_FUNCTION self_t operator++(int)
    {
        self_t retval = *this;
        ++input1;
        ++input2;
        return retval;
    }

    /// Prefix increment: advances both iterators, returns a copy of *this.
    THRUST_HIP_FUNCTION self_t operator++()
    {
        ++input1;
        ++input2;
        return *this;
    }

    /// Indirection (const): combines the two current values with the functor.
    THRUST_HIP_FUNCTION reference operator*() const
    {
        return op(*input1, *input2);
    }

    /// Indirection (non-const): same behavior as the const overload.
    THRUST_HIP_FUNCTION reference operator*()
    {
        return op(*input1, *input2);
    }

    /// Addition: adaptor with both iterators moved `n` steps ahead.
    THRUST_HIP_FUNCTION self_t operator+(difference_type n) const
    {
        return self_t(input1 + n, input2 + n, op);
    }

    /// Addition assignment
    THRUST_HIP_FUNCTION self_t& operator+=(difference_type n)
    {
        input1 += n;
        input2 += n;
        return *this;
    }

    /// Subtraction: adaptor with both iterators moved `n` steps back.
    THRUST_HIP_FUNCTION self_t operator-(difference_type n) const
    {
        return self_t(input1 - n, input2 - n, op);
    }

    /// Subtraction assignment
    THRUST_HIP_FUNCTION self_t& operator-=(difference_type n)
    {
        input1 -= n;
        input2 -= n;
        return *this;
    }

    /// Distance, measured on the first wrapped iterator.
    THRUST_HIP_FUNCTION difference_type operator-(self_t other) const
    {
        return input1 - other.input1;
    }

    /// Array subscript: combines the elements at offset `n` of both inputs.
    THRUST_HIP_FUNCTION reference operator[](difference_type n) const
    {
        return op(input1[n], input2[n]);
    }

    /// Equal to: both wrapped iterators must compare equal.
    THRUST_HIP_FUNCTION bool operator==(const self_t& rhs) const
    {
        return (input1 == rhs.input1) && (input2 == rhs.input2);
    }

    /// Not equal to: true when either wrapped iterator differs.
    THRUST_HIP_FUNCTION bool operator!=(const self_t& rhs) const
    {
        return (input1 != rhs.input1) || (input2 != rhs.input2);
    }

}; // struct transform_pair_of_input_iterators_t

// Functor that returns its argument unchanged, by reference.
// Two overloads preserve the const-ness of the argument.
struct identity
{
    /// Read-only overload: returns the same const reference it was given.
    template <class Value>
    __host__ __device__ Value const& operator()(Value const& value) const
    {
        return value;
    }

    /// Mutable overload: returns the same non-const reference it was given.
    template <class Value>
    __host__ __device__ Value& operator()(Value& value) const
    {
        return value;
    }
};

// Iterator-like wrapper around a running value of type T.
// Dereferencing yields the current value (`reference` is an alias of T, so it
// is returned by value); incrementing and offsetting change the stored value.
// T doubles as the difference type.
template <class T>
struct counting_iterator_t
{
    typedef counting_iterator_t             self_t;
    typedef T                               difference_type;
    typedef T                               value_type;
    typedef void                            pointer;
    typedef T                               reference;
    typedef std::random_access_iterator_tag iterator_category;

    T count; // current value

    /// Starts counting at `count_`.
    THRUST_HIP_FUNCTION counting_iterator_t(T count_)
        : count(count_)
    {
    }

    /// Postfix increment: bumps the value and returns the previous state.
    THRUST_HIP_FUNCTION self_t operator++(int)
    {
        self_t previous = *this;
        ++count;
        return previous;
    }

    /// Prefix increment: bumps the value and returns a copy of *this.
    THRUST_HIP_FUNCTION self_t operator++()
    {
        ++count;
        return *this;
    }

    /// Indirection (const): the current value.
    THRUST_HIP_FUNCTION reference operator*() const
    {
        return count;
    }

    /// Indirection (non-const): the current value.
    THRUST_HIP_FUNCTION reference operator*()
    {
        return count;
    }

    /// Addition: iterator whose value is `n` larger.
    THRUST_HIP_FUNCTION self_t operator+(difference_type n) const
    {
        return self_t(count + n);
    }

    /// Addition assignment
    THRUST_HIP_FUNCTION self_t& operator+=(difference_type n)
    {
        count += n;
        return *this;
    }

    /// Subtraction: iterator whose value is `n` smaller.
    THRUST_HIP_FUNCTION self_t operator-(difference_type n) const
    {
        return self_t(count - n);
    }

    /// Subtraction assignment
    THRUST_HIP_FUNCTION self_t& operator-=(difference_type n)
    {
        count -= n;
        return *this;
    }

    /// Distance: difference between the two stored values.
    THRUST_HIP_FUNCTION difference_type operator-(self_t rhs) const
    {
        return count - rhs.count;
    }

    /// Array subscript: the value `n` steps ahead of the current one.
    THRUST_HIP_FUNCTION reference operator[](difference_type n) const
    {
        return count + n;
    }

    /// Equal to
    THRUST_HIP_FUNCTION bool operator==(const self_t& rhs) const
    {
        return count == rhs.count;
    }

    /// Not equal to
    THRUST_HIP_FUNCTION bool operator!=(const self_t& rhs) const
    {
        return count != rhs.count;
    }

}; // struct counting_iterator_t

} // cuda_cub
THRUST_NAMESPACE_END
