Unverified Commit 52c3fe05 authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

Refactor for MIOpen integration (#4)

Refactor, so can bring multi-index transformation and padding support into MIOpen
parent 9aaeacc8
......@@ -5,14 +5,12 @@
#include "utility.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
#include "math.hpp"
#include "vector_type.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "array.hpp"
#include "array_helper.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
......@@ -22,8 +20,8 @@
#include "amd_inline_asm.hpp"
#endif
#if CK_USE_AMD_INTRINSIC
#include "amd_intrinsic.hpp"
#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp"
#endif
#endif
......@@ -3,23 +3,58 @@
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#include "bfloat16_dev.hpp"
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0
// device backend
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC 1
// AMD inline asm
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
// AMD buffer addressing
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif
#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
#endif
// AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 1
#endif
#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
#endif
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
// workaround
#define CK_WORKAROUND_SWDEV_202749 1
namespace ck {
enum address_space_t
enum AddressSpace
{
generic = 0,
global = 3
generic,
global
};
#if CK_UNSIGNED_INDEX_TYPE
......@@ -28,24 +63,8 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif
// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
// int32x4_t use by buffer_load and buffer_store llvm intrinsic
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
};
} // namespace ck
#endif
......@@ -6,21 +6,33 @@
#include "nvToolsExt.h"
#include "helper_cuda.h"
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0
// device backend
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INTRINSIC 0
// disable AMD inline asm and intrinsic
#define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_BUFFER_ADDRESSING 0
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0
#define CK_USE_AMD_XDLOPS 0
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
namespace ck {
enum address_space_t
enum AddressSpace
{
generic = 0,
generic,
global = generic
};
......@@ -30,24 +42,5 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif
// For some reason, CUDA need this definition, otherwise
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct
// instruction,
using float2_t = float2;
using float4_t = float4;
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
};
} // namespace ck
#endif
This diff is collapsed.
#ifndef CK_ARRAY_HELPER_HPP
#define CK_ARRAY_HELPER_HPP
#ifndef CK_PRINT_ARRAY_HPP
#define CK_PRINT_ARRAY_HPP
#include "array.hpp"
......
#ifndef CK_SEQUENCE_HELPER_HPP
#define CK_SEQUENCE_HELPER_HPP
#ifndef CK_PRINT_SEQUENCE_HPP
#define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp"
......
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
......
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
// this is ugly, only for 4d
template <class TConstTensorDesc>
......
This diff is collapsed.
This diff is collapsed.
#!/bin/bash
export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
export KMDUMPISA=1
export KMDUMPLLVM=1
#export KMOPTLLC="-mattr=+enable-ds128"
export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
export KMDUMPDIR=$PWD
make -j driver
/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm
make -j driver
#/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment