Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
d51b8158
Commit
d51b8158
authored
Oct 19, 2018
by
Chao Liu
Browse files
cpu direct conv
parent
06c9f9fe
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
193 additions
and
148 deletions
+193
-148
CMakeLists.txt
CMakeLists.txt
+2
-0
driver/conv.cpp
driver/conv.cpp
+55
-19
src/CMakeLists.txt
src/CMakeLists.txt
+1
-3
src/include/tensor.hpp
src/include/tensor.hpp
+135
-126
No files found.
CMakeLists.txt
View file @
d51b8158
...
...
@@ -41,5 +41,7 @@ link_libraries(${PYTHON_LIBRARIES})
include_directories
(
BEFORE
${
CUDA_COMMON_INCLUDE_DIR
}
)
#
include_directories
(
BEFORE src/include
)
add_subdirectory
(
src
)
add_subdirectory
(
driver
)
driver/conv.cpp
View file @
d51b8158
#include <iostream>
#include "tensor.hpp"
int
main
()
template
<
typename
T
>
void
direct_convolution
(
const
Tensor
<
T
>&
in
,
const
Tensor
<
T
>&
wei
,
Tensor
<
T
>&
out
,
std
::
size_t
num_thread
)
{
auto
f
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
double
v
=
0
;
for
(
int
c
=
0
;
c
<
wei
.
mDesc
.
GetLengths
()[
1
];
++
c
)
{
for
(
int
y
=
0
;
y
<
wei
.
mDesc
.
GetLengths
()[
2
];
++
y
)
{
int
hi
=
ho
+
y
;
for
(
int
x
=
0
;
x
<
wei
.
mDesc
.
GetLengths
()[
3
];
++
x
)
{
int
wi
=
wo
+
x
;
v
+=
in
(
n
,
c
,
hi
,
wi
)
*
wei
(
k
,
c
,
y
,
x
);
}
}
}
out
(
n
,
k
,
ho
,
wo
)
=
v
;
};
int
len_in
=
100
;
int
len_wei
=
3
;
int
len_out
=
len_in
-
len_wei
+
1
;
std
::
vector
<
float
>
in
(
len_in
,
1
);
std
::
vector
<
float
>
wei
(
len_wei
,
1
);
std
::
vector
<
float
>
out
(
len_out
,
1
);
auto
f_par
=
make_ParallelTensorFunctor
(
f
,
out
.
mDesc
.
GetLengths
()[
0
],
out
.
mDesc
.
GetLengths
()[
1
],
out
.
mDesc
.
GetLengths
()[
2
],
out
.
mDesc
.
GetLengths
()[
3
]);
direct_convolution
(
in
.
data
(),
wei
.
data
(),
out
.
data
(),
len_in
,
len_wei
);
f_par
(
num_thread
);
}
template
<
typename
T
>
void
direct_convolution
(
const
T
*
in
,
const
T
*
wei
,
T
*
out
,
const
int
len_in
,
const
int
len_wei
)
template
<
class
T
>
struct
Generator
{
int
len_out
=
len_in
-
len_wei
+
1
;
for
(
int
i_out
=
0
;
i_out
<
len_out
++
i_out
)
{
double
acc
=
0
;
for
(
int
i_wei
=
0
;
i_wei
<
len_wei
;
++
i_wei
)
template
<
class
...
Is
>
T
operator
()(
Is
...
is
)
{
acc
+=
in
[
i_out
+
i_wei
]
*
*
wei
[
i_wei
];
}
out
[
i_out
]
=
acc
;
return
1
;
}
};
int
main
()
{
Tensor
<
float
>
in
({
3
,
16
,
128
,
128
});
Tensor
<
float
>
wei
({
4
,
16
,
3
,
3
});
Tensor
<
float
>
out
({
3
,
4
,
126
,
126
});
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
std
::
cout
<<
__func__
<<
": num_thread "
<<
num_thread
<<
std
::
endl
;
in
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
wei
.
GenerateTensorValue
(
Generator
<
float
>
{},
num_thread
);
direct_convolution
(
in
,
wei
,
out
,
num_thread
);
std
::
cout
<<
__func__
<<
": done"
<<
std
::
endl
;
LogRange
(
std
::
cout
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
wei
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
,
out
.
mData
,
","
)
<<
std
::
endl
;
}
src/CMakeLists.txt
View file @
d51b8158
include_directories
(
BEFORE include
)
set
(
SOURCE
tensor.cpp;
)
...
...
@@ -11,7 +9,7 @@ set_target_properties(convolution PROPERTIES PREFIX "")
target_link_libraries
(
convolution boost_python3
)
# cuda
target_link_libraries
(
convolution nvToolsExt
)
target_link_libraries
(
convolution nvToolsExt
cudart
)
target_compile_features
(
convolution PUBLIC
)
set_target_properties
(
convolution PROPERTIES POSITION_INDEPENDENT_CODE ON
)
set_target_properties
(
convolution PROPERTIES CUDA_SEPARABLE_COMPILATION OFF
)
...
...
src/include/tensor.hpp
View file @
d51b8158
#include <thread>
#include <vector>
#include <numeric>
#include <algorithm>
#include <utility>
#include <cassert>
#include <iostream>
#include "cuda_runtime.h"
#include "helper_cuda.h"
template
<
class
Range
>
std
::
ostream
&
LogRange
(
std
::
ostream
&
os
,
Range
&&
r
,
std
::
string
delim
)
{
bool
first
=
true
;
for
(
auto
&&
x
:
r
)
{
if
(
first
)
first
=
false
;
else
os
<<
delim
;
os
<<
x
;
}
return
os
;
}
typedef
enum
{
Half
=
0
,
...
...
@@ -19,6 +37,34 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
template
<
class
F
,
class
T
,
std
::
size_t
...
Is
>
auto
call_f_unpack_args_impl
(
F
f
,
T
args
,
std
::
index_sequence
<
Is
...
>
)
{
return
f
(
std
::
get
<
Is
>
(
args
)...);
}
template
<
class
F
,
class
T
>
auto
call_f_unpack_args
(
F
f
,
T
args
)
{
constexpr
std
::
size_t
N
=
std
::
tuple_size
<
T
>::
value
;
return
call_f_unpack_args_impl
(
f
,
args
,
std
::
make_index_sequence
<
N
>
{});
}
template
<
class
F
,
class
T
,
std
::
size_t
...
Is
>
auto
construct_f_unpack_args_impl
(
T
args
,
std
::
index_sequence
<
Is
...
>
)
{
return
F
(
std
::
get
<
Is
>
(
args
)...);
}
template
<
class
F
,
class
T
>
auto
construct_f_unpack_args
(
F
,
T
args
)
{
constexpr
std
::
size_t
N
=
std
::
tuple_size
<
T
>::
value
;
return
construct_f_unpack_args_impl
<
F
>
(
args
,
std
::
make_index_sequence
<
N
>
{});
}
struct
TensorDescriptor
{
TensorDescriptor
()
=
delete
;
...
...
@@ -50,12 +96,12 @@ struct TensorDescriptor
const
std
::
vector
<
std
::
size_t
>&
GetLengths
()
const
;
const
std
::
vector
<
std
::
size_t
>&
GetStrides
()
const
;
template
<
class
...
X
s
>
std
::
size_t
Get1dIndex
(
X
s
...
x
s
)
const
template
<
class
...
I
s
>
std
::
size_t
Get1dIndex
(
I
s
...
i
s
)
const
{
assert
(
sizeof
...(
X
s
)
==
this
->
GetDimension
());
std
::
initializer_list
<
std
::
size_t
>
is
{
xs
...};
return
std
::
inner_product
(
is
.
begin
(),
is
.
end
(),
mStrides
.
begin
(),
std
::
size_t
{
0
});
assert
(
sizeof
...(
I
s
)
==
this
->
GetDimension
());
std
::
initializer_list
<
std
::
size_t
>
is
s
{
static_cast
<
std
::
size_t
>
(
is
)
...};
return
std
::
inner_product
(
is
s
.
begin
(),
is
s
.
end
(),
mStrides
.
begin
(),
std
::
size_t
{
0
});
}
private:
...
...
@@ -65,90 +111,6 @@ struct TensorDescriptor
DataType_t
mDataType
;
};
template
<
class
T
>
struct
Tensor
{
template
<
class
X
>
Tensor
(
std
::
initializer_list
<
X
>
lens
)
:
mDesc
(
DataType
<
T
>
{},
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
}
template
<
class
X
>
Tensor
(
std
::
vector
<
X
>
lens
)
:
mDesc
(
DataType
<
T
>
{},
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
}
template
<
class
X
,
class
Y
>
Tensor
(
std
::
vector
<
X
>
lens
,
std
::
vector
<
Y
>
strides
)
:
mDesc
(
DataType
<
T
>
{},
lens
,
strides
),
mData
(
mDesc
.
GetElementSpace
())
{
}
template
<
class
G
>
void
GenerateTensorValue
(
G
g
)
{
// ParallelTensorFunctor([&](Xs... xs) { mData(mDesc.Get1dIndex(xs...)) = g(xs...); },
// mDesc.mLens)();
switch
(
mDesc
.
GetDimension
())
{
case
1
:
{
ParallelTensorFunctor
([
&
](
auto
i
)
{
mData
(
mDesc
.
Get1dIndex
(
i
))
=
g
(
i
);
},
mDesc
.
GetLengths
()[
0
])();
break
;
}
case
2
:
{
ParallelTensorFunctor
(
[
&
](
auto
i0
,
auto
i1
)
{
mData
(
mDesc
.
Get1dIndex
(
i0
,
i1
))
=
g
(
i0
,
i1
);
},
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
])();
break
;
}
case
3
:
{
ParallelTensorFunctor
(
[
&
](
auto
i0
,
auto
i1
,
auto
i2
)
{
mData
(
mDesc
.
Get1dIndex
(
i0
,
i1
,
i2
))
=
g
(
i0
,
i1
,
i2
);
},
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
],
mDesc
.
GetLengths
()[
2
])();
break
;
}
case
4
:
{
ParallelTensorFunctor
(
[
&
](
auto
i0
,
auto
i1
,
auto
i2
,
auto
i3
)
{
mData
(
mDesc
.
Get1dIndex
(
i0
,
i1
,
i2
,
i3
))
=
g
(
i0
,
i1
,
i2
,
i3
);
},
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
],
mDesc
.
GetLengths
()[
3
],
mDesc
.
GetLengths
()[
4
])();
break
;
}
default:
throw
std
::
runtime_error
(
"unspported dimension"
);
}
}
T
&
operator
[](
std
::
size_t
i
)
{
return
mData
.
at
(
i
);
}
const
T
&
operator
[](
std
::
size_t
i
)
const
{
return
mData
.
at
(
i
);
}
typename
std
::
vector
<
T
>::
iterator
begin
()
{
return
mData
.
begin
();
}
typename
std
::
vector
<
T
>::
iterator
end
()
{
return
mData
.
end
();
}
typename
std
::
vector
<
T
>::
const_iterator
begin
()
const
{
return
mData
.
begin
();
}
typename
std
::
vector
<
T
>::
const_iterator
end
()
const
{
return
mData
.
end
();
}
TensorDescriptor
mDesc
;
std
::
vector
<
T
>
mData
;
};
struct
GpuMem
{
GpuMem
()
=
delete
;
...
...
@@ -194,12 +156,6 @@ struct joinable_thread : std::thread
template
<
class
F
,
class
...
Xs
>
struct
ParallelTensorFunctor
{
enum
ParallelMethod_t
{
Serial
=
0
,
Parallel
=
1
,
};
F
mF
;
static
constexpr
std
::
size_t
NDIM
=
sizeof
...(
Xs
);
std
::
array
<
std
::
size_t
,
NDIM
>
mLens
;
...
...
@@ -229,16 +185,7 @@ struct ParallelTensorFunctor
return
indices
;
}
void
operator
()(
std
::
integral_constant
<
ParallelMethod_t
,
ParallelMethod_t
::
Serial
>
)
{
for
(
std
::
size_t
i
=
0
;
i
<
mN1d
;
++
i
)
{
call_f_unpack_args
(
mF
,
GetNdIndices
(
i
));
}
}
void
operator
()(
std
::
integral_constant
<
ParallelMethod_t
,
ParallelMethod_t
::
Parallel
>
,
std
::
size_t
num_thread
)
void
operator
()(
std
::
size_t
num_thread
)
const
{
std
::
size_t
work_per_thread
=
(
mN1d
+
num_thread
-
1
)
/
num_thread
;
...
...
@@ -247,7 +194,7 @@ struct ParallelTensorFunctor
for
(
std
::
size_t
it
=
0
;
it
<
num_thread
;
++
it
)
{
std
::
size_t
iw_begin
=
it
*
work_per_thread
;
std
::
size_t
iw_end
=
std
::
min
((
(
it
+
1
)
*
work_per_thread
,
mN1d
)
)
;
std
::
size_t
iw_end
=
std
::
min
((
it
+
1
)
*
work_per_thread
,
mN1d
);
auto
f
=
[
=
]
{
for
(
std
::
size_t
iw
=
iw_begin
;
iw
<
iw_end
;
++
iw
)
...
...
@@ -260,30 +207,92 @@ struct ParallelTensorFunctor
}
};
template
<
class
F
,
class
T
>
auto
call_f_unpack_args
(
F
f
,
T
arg
s
)
template
<
class
F
,
class
...
Xs
>
auto
make_ParallelTensorFunctor
(
F
f
,
Xs
...
x
s
)
{
static
constexpr
std
::
size_t
N
=
std
::
tuple_size
<
T
>::
value
;
return
call_f_unpack_args_impl
(
f
,
args
,
std
::
make_index_sequence
<
N
>
{});
return
ParallelTensorFunctor
<
F
,
Xs
...
>
(
f
,
xs
...);
}
template
<
class
F
,
class
T
,
class
...
Is
>
auto
call_f_unpack_args_impl
(
F
f
,
T
args
,
std
::
integer_sequence
<
Is
...
>
)
template
<
class
T
>
struct
Tensor
{
return
f
(
std
::
get
<
Is
>
(
args
)...);
}
template
<
class
X
>
Tensor
(
std
::
initializer_list
<
X
>
lens
)
:
mDesc
(
DataType
<
T
>
{},
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
}
template
<
class
F
,
class
T
,
class
...
Is
>
auto
construct_f_unpack_args_impl
(
T
args
,
std
::
integer_sequence
<
Is
...
>
)
{
return
F
(
std
::
get
<
Is
>
(
args
)...);
}
template
<
class
X
>
Tensor
(
std
::
vector
<
X
>
lens
)
:
mDesc
(
DataType
<
T
>
{},
lens
),
mData
(
mDesc
.
GetElementSpace
())
{
}
template
<
class
F
,
class
T
>
auto
construct_f_unpack_args
(
F
,
T
args
)
{
static
constexpr
std
::
size_t
N
=
std
::
tuple_size
<
T
>::
value
;
template
<
class
X
,
class
Y
>
Tensor
(
std
::
vector
<
X
>
lens
,
std
::
vector
<
Y
>
strides
)
:
mDesc
(
DataType
<
T
>
{},
lens
,
strides
),
mData
(
mDesc
.
GetElementSpace
())
{
}
return
construct_f_unpack_args_impl
<
F
>
(
args
,
std
::
make_index_sequence
<
N
>
{});
}
template
<
class
G
>
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
1
)
{
switch
(
mDesc
.
GetDimension
())
{
case
1
:
{
auto
f
=
[
&
](
auto
i
)
{
(
*
this
)(
i
)
=
g
(
i
);
};
make_ParallelTensorFunctor
(
f
,
mDesc
.
GetLengths
()[
0
])(
num_thread
);
break
;
}
case
2
:
{
auto
f
=
[
&
](
auto
i0
,
auto
i1
)
{
(
*
this
)(
i0
,
i1
)
=
g
(
i0
,
i1
);
};
make_ParallelTensorFunctor
(
f
,
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
])(
num_thread
);
break
;
}
case
3
:
{
auto
f
=
[
&
](
auto
i0
,
auto
i1
,
auto
i2
)
{
(
*
this
)(
i0
,
i1
,
i2
)
=
g
(
i0
,
i1
,
i2
);
};
make_ParallelTensorFunctor
(
f
,
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
],
mDesc
.
GetLengths
()[
2
])(
num_thread
);
break
;
}
case
4
:
{
auto
f
=
[
&
](
auto
i0
,
auto
i1
,
auto
i2
,
auto
i3
)
{
(
*
this
)(
i0
,
i1
,
i2
,
i3
)
=
g
(
i0
,
i1
,
i2
,
i3
);
};
make_ParallelTensorFunctor
(
f
,
mDesc
.
GetLengths
()[
0
],
mDesc
.
GetLengths
()[
1
],
mDesc
.
GetLengths
()[
2
],
mDesc
.
GetLengths
()[
3
])(
num_thread
);
break
;
}
default:
throw
std
::
runtime_error
(
"unspported dimension"
);
}
}
template
<
class
...
Is
>
T
&
operator
()(
Is
...
is
)
{
return
mData
[
mDesc
.
Get1dIndex
(
is
...)];
}
template
<
class
...
Is
>
const
T
&
operator
()(
Is
...
is
)
const
{
return
mData
[
mDesc
.
Get1dIndex
(
is
...)];
}
typename
std
::
vector
<
T
>::
iterator
begin
()
{
return
mData
.
begin
();
}
typename
std
::
vector
<
T
>::
iterator
end
()
{
return
mData
.
end
();
}
typename
std
::
vector
<
T
>::
const_iterator
begin
()
const
{
return
mData
.
begin
();
}
typename
std
::
vector
<
T
>::
const_iterator
end
()
const
{
return
mData
.
end
();
}
TensorDescriptor
mDesc
;
std
::
vector
<
T
>
mData
;
};
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment