Commit 2662f8e5 authored by root
Browse files

fixed naive

parent 61a1c170
...@@ -130,13 +130,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -130,13 +130,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
// thread A, B for GEMM // thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2( constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
Number<KPerThreadLoop>{}, Number<MPerThread>{}); make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2( constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
Number<KPerThreadLoop>{}, Number<NPerThread>{}); make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v3<BlockMatrixA, constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v3<BlockMatrixA,
decltype(a_thread_mtx), decltype(a_thread_mtx),
...@@ -342,7 +342,7 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3 ...@@ -342,7 +342,7 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
template <typename FloatA, typename FloatB, typename FloatC> template <typename FloatA, typename FloatB, typename FloatC>
__device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const __device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
{ {
#if CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE #if 0
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment