gaoqiong / composable_kernel · Commit c1eaba27
Authored Aug 19, 2020 by Chao Liu

prototype dynamic tensor descriptor

Parent: f1541994
Showing 5 changed files with 350 additions and 54 deletions (+350 -54)
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp            +20  -22
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp    +6   -5
composable_kernel/include/tensor_description/dynamic_tensor_coordinate.hpp        +270 -0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp        +23  -12
driver/include/device_dummy_dynamic_transform.hpp                                 +31  -15
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
@@ -2,10 +2,9 @@
 #define CK_DUMMY_DYNAMIC_TRANSFORM_HPP

 #include "common_header.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
 #include "dynamic_tensor_descriptor.hpp"
 #include "dynamic_tensor_descriptor_helper.hpp"
+#include "dynamic_tensor_coordinate.hpp"

 namespace ck {

 __host__ __device__ constexpr auto ...
@@ -83,7 +82,7 @@ template <index_t BlockSize>
 struct DummyDynamicTransform
 {
     __device__ void Run_v1(index_t* const __restrict__ p_wei_global,
-                           index_t* const __restrict__ p_in_global,
+                           float* const __restrict__ p_in_global,
                            float* const __restrict__ p_out_global,
                            const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
                            const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
...
@@ -522,7 +521,7 @@ struct DummyDynamicTransform
     }

     __device__ void Run_v2(index_t* const __restrict__ p_wei_global,
-                           index_t* const __restrict__ p_in_global,
+                           float* const __restrict__ p_in_global,
                            float* const __restrict__ p_out_global,
                            const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
                            const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
...
@@ -542,24 +541,23 @@ struct DummyDynamicTransform
         const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});

-#pragma unroll 1
-        for(index_t iter = 0; iter < 100; ++iter)
-        {
-            MultiIndex<2> idx;
-
-            // initialize idx
-            for(index_t i = 0; i < 2; ++i)
-            {
-                idx(i) = p_wei_global[10 * iter + get_thread_local_1d_id() + i];
-            }
-
-            // offset
-            index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx);
+        MultiIndex<2> idx;
+
+        // initialize idx
+        for(index_t i = 0; i < 2; ++i)
+        {
+            idx(i) = p_wei_global[get_thread_local_1d_id() + i];
+        }
+
+        const index_t niter = p_wei_global[10];
+
+        auto in_gemmk_gemmn_coord =
+            make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
+
+        for(index_t iter = 0; iter < 100; ++iter)
+        {
             constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};

-            // is_in_bound
-            bool is_in_bound =
-                in_gemmk_gemmn_global_desc.IsValidUpperIndexMappedToValidLowerIndex(idx);
+            in_gemmk_gemmn_coord += gemmk1_gemmn0;

             // write
             float value = 1;
...
@@ -575,14 +573,14 @@ struct DummyDynamicTransform
                 true,
                 1,
                 p_out_global,
-                offset,
-                is_in_bound,
-                out_n_k_ho_wo_global_desc.GetElementSpace());
+                in_gemmk_gemmn_coord.GetOffset(),
+                in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                in_gemmk_gemmn_global_desc.GetElementSpace());
         }
     }

     __device__ void Run(index_t* const __restrict__ p_wei_global,
-                        index_t* const __restrict__ p_in_global,
+                        float* const __restrict__ p_in_global,
                         float* const __restrict__ p_out_global,
                         const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
                         const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
...
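Note: the Run_v2 change above is the heart of this commit. Instead of recomputing the full index-to-offset map with CalculateOffset(idx) on every iteration, the loop now carries a tensor coordinate and nudges it with operator+=, which only has to add an offset difference. A minimal, self-contained sketch of that idea follows; the Desc2D and Coord2D types here are plain C++ stand-ins for illustration, not CK's actual MultiIndex or descriptor types.

    #include <array>
    #include <cstdio>

    struct Desc2D
    {
        std::array<int, 2> strides;

        // full recomputation: what Run_v2 did before this commit
        int CalculateOffset(const std::array<int, 2>& idx) const
        {
            return idx[0] * strides[0] + idx[1] * strides[1];
        }
    };

    struct Coord2D
    {
        Desc2D desc;
        std::array<int, 2> idx;
        int offset; // cached offset of idx

        // incremental update: what the new tensor coordinate does
        Coord2D& operator+=(const std::array<int, 2>& step)
        {
            idx[0] += step[0];
            idx[1] += step[1];
            // for a linear (strided) map, the offset diff of a step is the offset of the step
            offset += desc.CalculateOffset(step);
            return *this;
        }
    };

    int main()
    {
        Desc2D desc{{1024, 1}};
        Coord2D coord{desc, {0, 0}, 0};

        for(int iter = 0; iter < 4; ++iter)
        {
            coord += {1, 0}; // one step along dim 0, like gemmk1_gemmn0 = MultiIndex<2>{1, 0}
            std::printf("offset %d\n", coord.offset);
        }
    }

For a native strided tensor this per-step update is a constant-cost add; the transformed-coordinate machinery below layers carry/borrow handling for merged dimensions on top of the same idea.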
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -344,7 +344,8 @@ struct DynamicMerge
         // If neither 1) nor 2) is satisfied, then the calculation will also be computed at
         // run-time each time this function is called, and can be very expensive.
-        LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
+        LowerIndex idx_low_diff_const;
+        CalculateLowerIndex(idx_low_diff_const, idx_up_diff);

         // do carry check on each low dimension in reversed order
         // do not need to check the first dimension
...
@@ -361,15 +362,15 @@ struct DynamicMerge
                 low_lengths_[i] + idx_low_diff_const[i];
 #endif

-            index_t idx_low_tmp[i] = idx_low_old[i] + carry;
+            index_t idx_low_tmp = idx_low_old[i] + carry;

-            bool do_carry = idx_low_tmp[i] >= idx_low_length_minus_idx_low_diff_const;
+            bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;

 #if 0
-            bool do_borrow = idx_low_tmp[i] < -idx_low_diff_const[i];
+            bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
 #endif

             idx_low_diff(i) =
-                do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const;
+                do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const[i];

 #if 0
             idx_low_diff(i) =
                 do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
...
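Note: the carry loop above is digit-wise addition in the mixed-radix system defined by the merged lower lengths; the fix replaces the ill-formed local array declaration idx_low_tmp[i] with a scalar. Below is a self-contained sketch of the same arithmetic, with illustrative radices and a plain int carry rather than CK's exact formulation.

    #include <array>
    #include <cstdio>

    int main()
    {
        // radices of the merged lower dimensions (hypothetical values)
        const std::array<int, 3> low_lengths = {4, 3, 5};

        std::array<int, 3> idx_low    = {1, 2, 4}; // current lower index
        const std::array<int, 3> diff = {0, 0, 1}; // per-digit step to apply

        // add digit-wise in reversed order, propagating the carry;
        // the first (most significant) digit needs no carry check
        int carry = 0;
        for(int i = 2; i >= 0; --i)
        {
            int tmp = idx_low[i] + diff[i] + carry;

            const bool do_carry = (i > 0) && tmp >= low_lengths[i];

            if(do_carry)
                tmp -= low_lengths[i];

            idx_low[i] = tmp;
            carry      = do_carry ? 1 : 0;
        }

        // {1, 2, 4} + {0, 0, 1} in radices {4, 3, 5} -> {2, 0, 0}
        std::printf("idx_low = {%d, %d, %d}\n", idx_low[0], idx_low[1], idx_low[2]);
    }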
composable_kernel/include/tensor_description/dynamic_tensor_coordinate.hpp (new file, 0 → 100644)
#ifndef CK_DYNAMIC_TENSOR_COORDINATE_HPP
#define CK_DYNAMIC_TENSOR_COORDINATE_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"

namespace ck {

// A "tensor coordinate" is an opaque object that represents a "point of location" inside a
// tensor. At the bare minimum, the user should be able to query the following information from
// a tensor coordinate:
//   1. Tensor descriptor
//   2. Location, represented in the form of a multi-index
//   3. Location, represented in the form of the offset to the origin of the tensor
//   4. Whether the location is inside an invalid area, e.g. the padding area of an implicitly
//      padded tensor is considered invalid, because the padding area doesn't have any physical
//      memory allocation
// A tensor coordinate also provides the following functionality:
//   1. Given a step size in each dimension, update itself, or return a new tensor coordinate,
//      so the user can freely move the "point of location" inside the tensor

// wrapper class for DynamicNativeTensorCoordinate and DynamicTransformedTensorCoordinate
template <typename TensorDesc>
struct DynamicTensorCoordinate;

// tensor coordinate for native tensor
template <typename TensorDesc>
struct DynamicNativeTensorCoordinate
{
    using type             = DynamicNativeTensorCoordinate;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t NDim = tensor_desc_type::GetNumOfDimension();

    using Index = MultiIndex<NDim>;

    __host__ __device__ explicit constexpr DynamicNativeTensorCoordinate(
        const tensor_desc_type& tensor_desc, const Index& idx)
        : tensor_desc_{tensor_desc}, idx_{idx}, offset_{tensor_desc.CalculateOffset(idx)}
    {
    }

    __host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_; }

    __host__ __device__ constexpr const auto& GetUpperIndex() const { return idx_; }

    __host__ __device__ constexpr const auto& GetIndex() const { return idx_; }

    __host__ __device__ constexpr const index_t& GetOffset() const { return offset_; }

    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // idx_ is updated here, but some (or all) of its entries may never be used;
        // the compiler should remove those entries as dead code
        idx_ += idx_diff;

        offset_ += tensor_desc_.CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        // idx_ is updated here, but some (or all) of its entries may never be used;
        // the compiler should remove those entries as dead code
        idx_ -= idx_diff;

        offset_ -= tensor_desc_.CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }

    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }

    __host__ __device__ constexpr index_t CalculateOffsetDiff(const Index& idx_diff) const
    {
        return tensor_desc_.CalculateOffsetDiff(idx_diff);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
        return tensor_desc_.IsUpperIndexValid(idx_);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsOffsetValid() const
    {
        // For a native tensor, the offset is valid if the upper-index is valid
        return IsUpperIndexValid();
    }

    // evaluated at compile-time
    __host__ __device__ static constexpr bool IsOffsetValidAssumingUpperIndexIsValid()
    {
        return true;
    }

    private:
    const tensor_desc_type tensor_desc_;

    // idx_ may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. The compiler should be able to remove these entries, as well as their
    // calculation, as dead code.
    // TODO: make sure the compiler indeed removes this dead code
    Index idx_;

    index_t offset_;
};

// tensor coordinate for transformed tensor
template <typename TensorDesc>
struct DynamicTransformedTensorCoordinate
{
    static constexpr index_t NDimUp = TensorDesc::GetNumOfDimension();

    using UpperDesc  = TensorDesc;
    using UpperCoord = DynamicTransformedTensorCoordinate;
    using UpperIndex = MultiIndex<NDimUp>;

    using LowerDesc  = typename UpperDesc::LowerDesc;
    using LowerCoord = typename DynamicTensorCoordinate<LowerDesc>::type;

    __host__ __device__ explicit constexpr DynamicTransformedTensorCoordinate(
        const UpperDesc& tensor_desc_up, const UpperIndex& idx_up)
        : tensor_desc_up_{tensor_desc_up},
          idx_up_{idx_up},
          coord_low_{tensor_desc_up.GetLowerTensorDescriptor(),
                     tensor_desc_up.CalculateLowerIndex(idx_up)}
    {
    }

    __host__ __device__ constexpr auto GetTensorDescriptor() const { return tensor_desc_up_; }

    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
    {
        return coord_low_;
    }

    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return idx_up_; }

    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return idx_up_; }

    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }

    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of a multi-index difference, not all transformation functions
        // need to know the old lower-index or the old upper-index. We pass both of them to
        // the transformation function. The transformation function itself decides to use
        // them or not.
        coord_low_ += tensor_desc_up_.CalculateLowerIndexDiff(
            idx_up_diff, GetLowerCoordinate().GetIndex(), GetIndex());

        // idx_up_ is updated here, but some (or all) of its entries may never be used;
        // the compiler should remove those entries as dead code
        idx_up_ += idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
        coord_low_ -= tensor_desc_up_.CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // idx_up_ is updated here, but some (or all) of its entries may never be used;
        // the compiler should remove those entries as dead code
        idx_up_ -= idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up += idx_up_diff;
        return coord_up;
    }

    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up -= idx_up_diff;
        return coord_up;
    }

    // Calculate the offset diff without updating the tensor coordinate.
    // If idx_up_diff is known at compile-time, and has only non-zero entries on linear
    // dimensions, then all calculation can be done at compile-time.
    // TODO: this function is not compiled to the expected ISA
    __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
    {
        // For transformation of a multi-index difference, not all transformation functions
        // need to know the old lower-index or the old upper-index. We pass both of them to
        // the transformation function. The transformation function itself decides to use
        // them or not.
        const auto idx_low_diff =
            tensor_desc_up_.CalculateLowerIndexDiff(idx_up_diff, coord_low_.GetIndex(), idx_up_);

        return coord_low_.CalculateOffsetDiff(idx_low_diff);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
        return tensor_desc_up_.IsUpperIndexValid(idx_up_);
    }

    // evaluated at run-time
    __host__ __device__ constexpr bool IsOffsetValid() const
    {
        return IsUpperIndexValid() && coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
    }

    // most evaluation is done at compile-time
    __host__ __device__ constexpr bool IsOffsetValidAssumingUpperIndexIsValid() const
    {
        return tensor_desc_up_.IsValidUpperIndexMappedToValidLowerIndex(idx_up_) &&
               coord_low_.IsOffsetValidAssumingUpperIndexIsValid();
    }

    private:
    const UpperDesc tensor_desc_up_;

    // idx_up_ may be calculated and updated, however, the value of some (or all) of its
    // entries may never be used. The compiler should be able to remove these entries, as well
    // as their calculation, as dead code.
    // TODO: make sure the compiler indeed removes this dead code
    UpperIndex idx_up_;

    LowerCoord coord_low_;
};

template <index_t NDim>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate(const DynamicNativeTensorDescriptor<NDim>& tensor_desc,
                               const MultiIndex<NDim>& idx)
{
    return DynamicNativeTensorCoordinate<DynamicNativeTensorDescriptor<NDim>>{tensor_desc, idx};
}

template <index_t NDim, typename... Ts>
__host__ __device__ constexpr auto
make_dynamic_tensor_coordinate(const DynamicTransformedTensorDescriptor<Ts...>& tensor_desc,
                               const MultiIndex<NDim>& idx)
{
    static_assert(DynamicTransformedTensorDescriptor<Ts...>::GetNumOfDimension() == NDim,
                  "wrong! inconsistent # of dimensions");

    return DynamicTransformedTensorCoordinate<DynamicTransformedTensorDescriptor<Ts...>>{
        tensor_desc, idx};
}

template <typename TensorDesc>
struct DynamicTensorCoordinate
{
    static constexpr index_t NDim = TensorDesc::GetNumOfDimension();

    using type = decltype(make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<NDim>{}));
};

} // namespace ck
#endif
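A sketch of how the new coordinate is meant to be used, mirroring the driver changes later in this commit; `desc` and the wrapping function are assumed for illustration, standing in for any 2-dimensional dynamic tensor descriptor built elsewhere.

    // hypothetical host-side helper, not part of the commit
    template <typename Desc>
    void walk_first_dimension(const Desc& desc)
    {
        // start at the origin
        auto coord = ck::make_dynamic_tensor_coordinate(desc, ck::MultiIndex<2>{0, 0});

        for(ck::index_t iter = 0; iter < 100; ++iter)
        {
            // the queries the header promises: offset and validity
            // (GetIndex() would return the multi-index form of the location)
            if(coord.IsOffsetValid())
            {
                printf("iter %d offset %d\n", iter, coord.GetOffset());
            }

            coord += ck::MultiIndex<2>{1, 0}; // step along the first dimension
        }
    }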
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
@@ -20,6 +20,11 @@ struct DynamicNativeTensorDescriptor
     {
     }

+    __host__ __device__ explicit constexpr DynamicNativeTensorDescriptor()
+        : lengths_{make_zero_array<index_t, NDim>()}, strides_{make_zero_array<index_t, NDim>()}
+    {
+    }
+
     __host__ __device__ static constexpr index_t GetNumOfDimension() { return NDim; }

     __host__ __device__ constexpr auto GetLengths() const { return lengths_; }
...
@@ -62,12 +67,10 @@ struct DynamicNativeTensorDescriptor
         return offset;
     }

-    template <typename UpIdxDiff, typename UpIdx, typename LowIdx>
-    __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpIdxDiff& idx_up_diff,
-                                                              const LowIdx& /* idx_low_old */,
-                                                              const UpIdx& /* idx_up_old */) const
+    template <typename IdxDiff>
+    __host__ __device__ constexpr index_t CalculateOffsetDiff(const IdxDiff& idx_diff) const
     {
-        return CalculateOffset(idx_up_diff);
+        return CalculateOffset(idx_diff);
     }

     template <typename Idx>
...
@@ -92,13 +95,16 @@ template <typename LowTensorDescriptor, // DynamicNativeTensorDescriptor or ...
           typename UpDimensionIds> // Tuple<Sequence<...>>
 struct DynamicTransformedTensorDescriptor
 {
-    const LowTensorDescriptor low_tensor_desc_;
-    const Transforms transforms_;

     using LowerDesc = LowTensorDescriptor;
     using UpperDesc = DynamicTransformedTensorDescriptor;

     static constexpr index_t NTransform = Transforms::Size();

+    const LowerDesc low_tensor_desc_;
+    const Transforms transforms_;

     __host__ __device__ static constexpr index_t GetNumOfLowerDimension()
     {
-        return LowTensorDescriptor::GetNumOfDimension();
+        return LowerDesc::GetNumOfDimension();
     }

     __host__ __device__ static constexpr index_t GetNumOfUpperDimension()
...
@@ -138,7 +144,7 @@ struct DynamicTransformedTensorDescriptor
     };

     __host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor(
-        const LowTensorDescriptor& low_tensor_desc, const Transforms& transforms)
+        const LowerDesc& low_tensor_desc, const Transforms& transforms)
         : low_tensor_desc_{low_tensor_desc}, transforms_{transforms}
     {
         static_assert(NTransform == Transforms::Size() && NTransform == LowDimensionIds::Size() &&
...
@@ -178,7 +184,12 @@ struct DynamicTransformedTensorDescriptor
     // of lower-tensor-descriptor
     }

-    __host__ __device__ static constexpr auto GetNumOfDimension()
+    __host__ __device__ explicit constexpr DynamicTransformedTensorDescriptor()
+        : low_tensor_desc_{}, transforms_{}
+    {
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfDimension()
     {
         return GetNumOfUpperDimension();
     }
...
@@ -281,7 +292,7 @@ struct DynamicTransformedTensorDescriptor
     {
         LowerIndex idx_low_diff;

-        CalculateLowerIndex(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);
+        CalculateLowerIndexDiff(idx_low_diff, idx_up_diff, idx_low_old, idx_up_old);

         return idx_low_diff;
     }
...
@@ -321,7 +332,7 @@ struct DynamicTransformedTensorDescriptor
             const auto up_dims_part = UpDimensionIds{}.At(itran);
             const auto idx_up_part  = pick_array_element(idx_up, up_dims_part);

-            flag = flag && IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
+            flag = flag && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up_part);
         }
     });
...
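One note on the two default constructors added in this file: they look unmotivated on their own, but the new DynamicTensorCoordinate wrapper selects the coordinate type via decltype(make_dynamic_tensor_coordinate(TensorDesc{}, MultiIndex<NDim>{})), which requires descriptors to be default-constructible even though the default-constructed values are never used at run-time. A self-contained sketch of that pattern, with hypothetical stand-in types:

    #include <type_traits>

    // Two hypothetical "descriptor" kinds and a factory overloaded on them,
    // standing in for DynamicNative/DynamicTransformedTensorDescriptor and
    // make_dynamic_tensor_coordinate.
    struct NativeDesc { };      // must be default-constructible
    struct TransformedDesc { }; // must be default-constructible

    struct NativeCoord { };
    struct TransformedCoord { };

    // declarations only: used in an unevaluated context, so never called
    NativeCoord      make_coord(const NativeDesc&);
    TransformedCoord make_coord(const TransformedDesc&);

    // The wrapper picks the coordinate type by asking "what would the factory
    // return for a default-constructed descriptor?"; this is why the
    // descriptors need (possibly meaningless) default constructors.
    template <typename Desc>
    struct CoordFor
    {
        using type = decltype(make_coord(Desc{}));
    };

    static_assert(std::is_same<CoordFor<NativeDesc>::type, NativeCoord>::value, "");
    static_assert(std::is_same<CoordFor<TransformedDesc>::type, TransformedCoord>::value, "");

    int main() { }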
driver/include/device_dummy_dynamic_transform.hpp
@@ -50,19 +50,37 @@ void device_dummy_dynamic_transform(InDesc, ...
                                          in_right_pads);

     const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});

-    print_array("cpu: in_gemmk_gemmn_global_desc:", in_gemmk_gemmn_global_desc.GetLengths());
-
-    const auto idx0 = MultiIndex<2>({2591, 36991});
-    const auto idx1 = in_gemmk_gemmn_global_desc.CalculateLowerIndex(idx0);
-    const auto idx2 =
-        in_gemmk_gemmn_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
-
-    const index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx0);
-
-    print_array("idx0:", idx0);
-    print_array("idx1:", idx1);
-    print_array("idx2:", idx2);
-    printf("offset %d\n", offset);
+    auto in_gemmk_gemmn_coord =
+        make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, MultiIndex<2>{0, 0});
+
+    for(index_t iter = 0; iter < 100; ++iter)
+    {
+        constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};
+
+        printf("iter %d\n", iter);
+        print_array("idx0: ", in_gemmk_gemmn_coord.GetIndex());
+        print_array("idx1: ", in_gemmk_gemmn_coord.GetLowerCoordinate().GetIndex());
+        print_array("idx2: ",
+                    in_gemmk_gemmn_coord.GetLowerCoordinate().GetLowerCoordinate().GetIndex());
+        print_array("idx3: ",
+                    in_gemmk_gemmn_coord.GetLowerCoordinate()
+                        .GetLowerCoordinate()
+                        .GetLowerCoordinate()
+                        .GetIndex());
+        print_array("idx4: ",
+                    in_gemmk_gemmn_coord.GetLowerCoordinate()
+                        .GetLowerCoordinate()
+                        .GetLowerCoordinate()
+                        .GetLowerCoordinate()
+                        .GetIndex());
+        printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
+        printf("\n");
+
+        in_gemmk_gemmn_coord += gemmk1_gemmn0;
+    }
     }

     std::size_t data_sz = sizeof(T);
...
@@ -79,7 +97,6 @@ void device_dummy_dynamic_transform(InDesc,
     printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

 #if 0
     using dummy_transform = DummyDynamicTransform<BlockSize>;

     for(index_t i = 0; i < 5; ++i)
...
@@ -93,7 +110,7 @@ void device_dummy_dynamic_transform(InDesc,
     {
         launch_kernel(run_gridwise_operation<dummy_transform,
                                              index_t* const,
-                                             index_t* const,
+                                             float* const,
                                              float* const,
                                              const DynamicNativeTensorDescriptor<4>,
                                              const DynamicNativeTensorDescriptor<4>,
...
@@ -106,8 +123,8 @@ void device_dummy_dynamic_transform(InDesc,
                       dim3(BlockSize),
                       0,
                       0,
-                      static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
                       static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+                      static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
                       static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
                       wei_kcyx_desc,
                       in_nchw_desc,
...
@@ -118,7 +135,6 @@ void device_dummy_dynamic_transform(InDesc,
                       in_right_pads);
         }
     }
 #endif

     out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
 }