Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c1b3fb95
"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "52e4ded85ff2448fc82e0112242c0cbb523632ec"
Commit
c1b3fb95
authored
Aug 07, 2019
by
Jehandad Khan
Browse files
host verification in progress
parent
eb8a1bf9
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
89 additions
and
31 deletions
+89
-31
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
...n_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+4
-6
driver/include/conv_common.hpp
driver/include/conv_common.hpp
+2
-0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
.../device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+4
-2
driver/include/host_conv.hpp
driver/include/host_conv.hpp
+16
-4
driver/include/tensor.hpp
driver/include/tensor.hpp
+16
-0
driver/src/driver.cpp
driver/src/driver.cpp
+37
-13
driver/src/tensor.cpp
driver/src/tensor.cpp
+4
-0
script/cmake-cuda.sh
script/cmake-cuda.sh
+4
-4
script/cmake-cuda_docker.sh
script/cmake-cuda_docker.sh
+2
-2
No files found.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
View file @
c1b3fb95
...
@@ -333,8 +333,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -333,8 +333,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
// LDS double buffer: load next data from device mem
// LDS double buffer: load next data from device mem
blockwise_in_copy
.
RunLoadRegisterClipboard
(
p_in_global
,
p_in_register_clipboard
);
blockwise_in_copy
.
RunLoadRegisterClipboard
(
p_in_global
,
p_in_register_clipboard
);
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_block_on_global
,
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_block_on_global
,
p_wei_register_clipboard
);
p_wei_register_clipboard
);
// LDS double buffer: GEMM on current data
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
p_wei_block_double
,
p_in_block_double
,
p_out_thread
);
blockwise_gemm
.
Run
(
p_wei_block_double
,
p_in_block_double
,
p_out_thread
);
...
@@ -399,8 +399,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -399,8 +399,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
// origin of dst in device memory
// origin of dst in device memory
Float
*
p_out_thread_on_global
=
Float
*
p_out_thread_on_global
=
p_out_global
+
p_out_global
+
out_k_n1_b_n2_global_merged_desc
.
GetOffsetFromMultiIndex
(
out_k_n1_b_n2_global_merged_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_on_global
,
0
,
b_thread_data_on_global
,
0
);
k_thread_data_on_global
,
0
,
b_thread_data_on_global
,
0
);
#if 1
#if 1
threadwise_generic_tensor_slice_copy_v1
(
threadwise_generic_tensor_slice_copy_v1
(
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc
,
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc
,
...
@@ -412,8 +412,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
...
@@ -412,8 +412,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc
.
GetLengths
(),
out_n0_n1_n2_k0_k1_k2_h_w_thread_desc
.
GetLengths
(),
arithmetic_sequence_gen
<
0
,
8
,
1
>::
type
{},
arithmetic_sequence_gen
<
0
,
8
,
1
>::
type
{},
Number
<
1
>
{});
Number
<
1
>
{});
#elif 0
p_out_global
[
0
]
=
p_out_thread
[
0
];
#endif
#endif
}
}
}
}
...
...
driver/include/conv_common.hpp
View file @
c1b3fb95
...
@@ -3,6 +3,8 @@
...
@@ -3,6 +3,8 @@
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor.hpp"
// Selects which convolution direction the driver runs and verifies.
// Deliberately an unscoped enum: callers refer to the enumerators without
// qualification (e.g. `dir == Forward`, `dir = BackwardWeights`).
// The former `typedef enum ConvolutionDir {...};` declared no typedef-name, so the
// `typedef` keyword was ignored (and warned about) by the compiler; it is dropped here.
enum ConvolutionDir
{
    Forward         = 0,
    BackwardData    = 1,
    BackwardWeights = 2
};
// this is ugly, only for 4d
// this is ugly, only for 4d
template
<
class
InDesc
,
class
WeiDesc
>
template
<
class
InDesc
,
class
WeiDesc
>
constexpr
auto
get_convolution_output_default_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
constexpr
auto
get_convolution_output_default_4d_tensor_descriptor
(
InDesc
,
WeiDesc
)
...
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @
c1b3fb95
...
@@ -16,9 +16,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
...
@@ -16,9 +16,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
// the input desc needs to be reordered for wrw : cnhw would be the new order
// the input desc needs to be reordered for wrw : cnhw would be the new order
// the forward kernel always assumes reduction on the second dim and this would make it reduce on the n dimension due to the switching we did
// the forward kernel always assumes reduction on the second dim and this would make it reduce on the n dimension due to the switching we did
const
Tensor
<
T
>&
in_nchw
,
Tensor
<
T
>&
in_nchw
,
WeiDesc
,
WeiDesc
,
const
Tensor
<
T
>&
wei_kcyx
,
Tensor
<
T
>&
wei_kcyx
,
OutDesc
,
OutDesc
,
Tensor
<
T
>&
out_nkhw
,
Tensor
<
T
>&
out_nkhw
,
ConvStrides
,
ConvStrides
,
...
@@ -252,4 +252,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
...
@@ -252,4 +252,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
}
}
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
FromDevice
(
out_nkhw
.
mData
.
data
());
in_nchw_device_buf
.
FromDevice
(
in_nchw
.
mData
.
data
());
wei_kcyx_device_buf
.
FromDevice
(
wei_kcyx
.
mData
.
data
());
}
}
driver/include/host_conv.hpp
View file @
c1b3fb95
#pragma once
#pragma once
#include "tensor.hpp"
#include "tensor.hpp"
#include "common_header.hpp"
#include "common_header.hpp"
#include "conv_common.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantTensorDescriptor.hpp"
// this is ugly, only for 4d
// this is ugly, only for 4d
...
@@ -52,15 +53,26 @@ template <class TIn,
...
@@ -52,15 +53,26 @@ template <class TIn,
class
ConvDilations
,
class
ConvDilations
,
class
LowerPads
,
class
LowerPads
,
class
UpperPads
>
class
UpperPads
>
void
host_direct_convolution
(
const
Tensor
<
TIn
>&
in_nchw
,
void
host_direct_convolution
(
Tensor
<
TIn
>&
in_nchw
,
const
Tensor
<
TWei
>&
wei_kcyx
,
Tensor
<
TWei
>&
wei_kcyx
,
Tensor
<
TOut
>&
out_nkhw
,
Tensor
<
TOut
>&
out_nkhw
,
ConvStrides
,
ConvStrides
,
ConvDilations
,
ConvDilations
,
LowerPads
,
LowerPads
,
UpperPads
)
UpperPads
,
ConvolutionDir
dir
)
{
{
using
namespace
ck
;
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
#if 1
// wrw
in_nchw
.
mDesc
.
ReorderGivenNew2Old
({
1
,
0
,
2
,
3
});
wei_kcyx
.
mDesc
.
ReorderGivenNew2Old
({
1
,
0
,
2
,
3
});
out_nkhw
.
mDesc
.
ReorderGivenNew2Old
({
1
,
0
,
2
,
3
});
#endif
index_t
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
index_t
h_pad_low
=
LowerPads
{}.
Get
(
Number
<
0
>
{});
index_t
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
index_t
w_pad_low
=
LowerPads
{}.
Get
(
Number
<
1
>
{});
...
@@ -81,7 +93,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
...
@@ -81,7 +93,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
if
(
hi
>=
0
&&
hi
<
in_nchw
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
if
(
hi
>=
0
&&
hi
<
in_nchw
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in_nchw
.
mDesc
.
GetLengths
()[
3
])
wi
<
in_nchw
.
mDesc
.
GetLengths
()[
3
])
{
{
v
+=
double
(
in_nchw
(
n
,
c
,
hi
,
wi
))
*
double
(
wei_kcyx
(
k
,
c
,
y
,
x
));
v
+=
double
(
in_nchw
(
n
,
c
,
hi
,
wi
))
/*
double(wei_kcyx(k, c, y, x))
*/
;
}
}
}
}
}
}
...
...
driver/include/tensor.hpp
View file @
c1b3fb95
...
@@ -101,6 +101,22 @@ struct TensorDescriptor
...
@@ -101,6 +101,22 @@ struct TensorDescriptor
std
::
initializer_list
<
std
::
size_t
>
iss
{
static_cast
<
std
::
size_t
>
(
is
)...};
std
::
initializer_list
<
std
::
size_t
>
iss
{
static_cast
<
std
::
size_t
>
(
is
)...};
return
std
::
inner_product
(
iss
.
begin
(),
iss
.
end
(),
mStrides
.
begin
(),
std
::
size_t
{
0
});
return
std
::
inner_product
(
iss
.
begin
(),
iss
.
end
(),
mStrides
.
begin
(),
std
::
size_t
{
0
});
}
}
// Permutes the descriptor's dimensions in place.
//
// `is` is a new-to-old map: entry `is[newDim]` names the existing dimension whose
// length and stride are moved to position `newDim`. Only mLens and mStrides are
// rewritten; nothing else is touched here.
//
// Preconditions (checked with assert, so only in builds without NDEBUG):
//   - `is` has exactly as many entries as the descriptor has dimensions;
//   - every entry is a valid index into the current dimensions.
void ReorderGivenNew2Old(std::vector<std::size_t> is)
{
    assert(mLens.size() == is.size());
    assert(mStrides.size() == is.size());

    std::vector<std::size_t> newLens(mLens.size());
    std::vector<std::size_t> newStrides(mStrides.size());

    // Index with std::size_t to match the containers' size_type.
    for(std::size_t newDim = 0; newDim < is.size(); ++newDim)
    {
        const std::size_t oldDim = is[newDim];

        // An entry outside the current rank would read past the end of mLens/mStrides.
        assert(oldDim < mLens.size());

        newLens[newDim]    = mLens[oldDim];
        newStrides[newDim] = mStrides[oldDim];
    }

    // Swap rather than copy-assign: the temporaries are discarded right after.
    mLens.swap(newLens);
    mStrides.swap(newStrides);
}
private:
private:
std
::
vector
<
std
::
size_t
>
mLens
;
std
::
vector
<
std
::
size_t
>
mLens
;
...
...
driver/src/driver.cpp
View file @
c1b3fb95
...
@@ -67,12 +67,14 @@ struct GeneratorTensor_Checkboard
...
@@ -67,12 +67,14 @@ struct GeneratorTensor_Checkboard
}
}
};
};
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
using
namespace
ck
;
using
namespace
ck
;
ConvolutionDir
dir
=
Forward
;
#if 1
#if 1
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
C
=
1536
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
HI
=
8
;
constexpr
index_t
WI
=
8
;
constexpr
index_t
WI
=
8
;
...
@@ -85,6 +87,7 @@ int main(int argc, char* argv[])
...
@@ -85,6 +87,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
dir
=
BackwardWeights
;
#elif 0
#elif 0
// 3x3, 34x34
// 3x3, 34x34
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
...
@@ -477,8 +480,10 @@ int main(int argc, char* argv[])
...
@@ -477,8 +480,10 @@ int main(int argc, char* argv[])
using
in_data_t
=
float
;
using
in_data_t
=
float
;
using
out_data_t
=
float
;
using
out_data_t
=
float
;
Tensor
<
in_data_t
>
in_nchw
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
in_data_t
>
in_nchw_device
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
in_data_t
>
wei_kcyx
(
make_TensorDescriptor
(
wei_kcyx_desc
));
Tensor
<
in_data_t
>
wei_kcyx_device
(
make_TensorDescriptor
(
wei_kcyx_desc
));
Tensor
<
in_data_t
>
in_nchw_host
(
make_TensorDescriptor
(
in_nchw_desc
));
Tensor
<
in_data_t
>
wei_kcyx_host
(
make_TensorDescriptor
(
wei_kcyx_desc
));
Tensor
<
out_data_t
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
out_data_t
>
out_nkhw_host
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
out_data_t
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
Tensor
<
out_data_t
>
out_nkhw_device
(
make_TensorDescriptor
(
out_nkhw_desc
));
...
@@ -505,8 +510,24 @@ int main(int argc, char* argv[])
...
@@ -505,8 +510,24 @@ int main(int argc, char* argv[])
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_3
{},
num_thread
);
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_3
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
#elif 1
#elif 1
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
in_nchw_device
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
assert
(
in_nchw_device
.
mData
.
size
()
==
in_nchw_host
.
mData
.
size
());
for
(
auto
i
=
0
;
i
<
in_nchw_device
.
mData
.
size
();
++
i
)
{
in_nchw_host
.
mData
[
i
]
=
in_nchw_device
.
mData
[
i
];
}
wei_kcyx_device
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
assert
(
wei_kcyx_device
.
mData
.
size
()
==
wei_kcyx_host
.
mData
.
size
());
for
(
auto
i
=
0
;
i
<
wei_kcyx_device
.
mData
.
size
();
++
i
)
{
wei_kcyx_host
.
mData
[
i
]
=
wei_kcyx_device
.
mData
[
i
];
}
out_nkhw_device
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
assert
(
out_nkhw_device
.
mData
.
size
()
==
out_nkhw_host
.
mData
.
size
());
for
(
auto
i
=
0
;
i
<
out_nkhw_device
.
mData
.
size
();
++
i
)
{
out_nkhw_host
.
mData
[
i
]
=
out_nkhw_device
.
mData
[
i
];
}
#elif 0
#elif 0
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
in_nchw
.
GenerateTensorValue
(
GeneratorTensor_2
{
1
,
5
},
num_thread
);
...
@@ -536,9 +557,9 @@ int main(int argc, char* argv[])
...
@@ -536,9 +557,9 @@ int main(int argc, char* argv[])
// this is the same as MIOpen
// this is the same as MIOpen
// I should modify this one
// I should modify this one
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw
(
in_nchw_desc
,
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw
(
in_nchw_desc
,
in_nchw
,
in_nchw
_device
,
wei_kcyx_desc
,
wei_kcyx_desc
,
wei_kcyx
,
wei_kcyx
_device
,
out_nkhw_desc
,
out_nkhw_desc
,
out_nkhw_device
,
out_nkhw_device
,
ConvStrides
{},
ConvStrides
{},
...
@@ -588,7 +609,7 @@ int main(int argc, char* argv[])
...
@@ -588,7 +609,7 @@ int main(int argc, char* argv[])
if
(
do_verification
)
if
(
do_verification
)
{
{
#if
1
#if
0
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
{
{
...
@@ -597,15 +618,18 @@ int main(int argc, char* argv[])
...
@@ -597,15 +618,18 @@ int main(int argc, char* argv[])
else
else
#endif
#endif
{
{
host_direct_convolution
(
in_nchw
,
host_direct_convolution
(
in_nchw_host
,
wei_kcyx
,
out_nkhw_host
,
out_nkhw_host
,
ConvStrides
{}
,
wei_kcyx_host
,
ConvDilations
{},
ConvDilations
{},
ConvStrides
{},
lower_pads
,
lower_pads
,
upper_pads
);
upper_pads
,
dir
);
}
}
check_error
(
out_nkhw_host
,
out_nkhw_device
);
if
(
dir
==
Forward
)
check_error
(
out_nkhw_host
,
out_nkhw_device
);
else
check_error
(
wei_kcyx_host
,
wei_kcyx_device
);
#if 0
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
...
...
driver/src/tensor.cpp
View file @
c1b3fb95
...
@@ -7,6 +7,10 @@ TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mL
...
@@ -7,6 +7,10 @@ TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mL
{
{
this
->
CalculateStrides
();
this
->
CalculateStrides
();
}
}
// Constructs a descriptor from caller-supplied lengths and strides.
// Both lists are copied as given; unlike the lengths-only constructor, this one does
// not call CalculateStrides().
TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens,
                                   std::initializer_list<std::size_t> strides)
    : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
{
}
TensorDescriptor
::
TensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
)
TensorDescriptor
::
TensorDescriptor
(
std
::
vector
<
std
::
size_t
>
lens
,
std
::
vector
<
std
::
size_t
>
strides
)
:
mLens
(
lens
),
mStrides
(
strides
)
:
mLens
(
lens
),
mStrides
(
strides
)
...
...
script/cmake-cuda.sh
View file @
c1b3fb95
...
@@ -4,17 +4,17 @@ rm -f CMakeCache.txt
...
@@ -4,17 +4,17 @@ rm -f CMakeCache.txt
rm
-f
*
.cmake
rm
-f
*
.cmake
rm
-rf
CMakeFiles
rm
-rf
CMakeFiles
MY_PROJECT_SOURCE
=
/home/chao/code/modular_convolution
MY_PROJECT_SOURCE
=
..
MY_PROJECT_INSTALL
=
../install.dir
MY_PROJECT_INSTALL
=
../install.dir
cmake
\
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_CXX_COMPILER
=
clang++
\
-D
CMAKE_CXX_COMPILER
=
clang++
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_BUILD_TYPE
=
Debug
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
DEVICE_BACKEND
=
NVIDIA
\
-D
DEVICE_BACKEND
=
NVIDIA
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/
package/install/cuda/10.1
/NVIDIA_CUDA-10.1_Samples/common/inc"
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/
home/jehandad
/NVIDIA_CUDA-10.1_Samples/common/inc"
\
-D
CMAKE_CUDA_FLAGS
=
"-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_
61
,code=sm_
61
"
\
-D
CMAKE_CUDA_FLAGS
=
"
-g -G -Xcompiler -O0 -Xptxas -O0 -lineinfo -O0
-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_
52
,code=sm_
52
"
\
${
MY_PROJECT_SOURCE
}
${
MY_PROJECT_SOURCE
}
#-D BOOST_ROOT="/package/install/boost_1.67.0" \
#-D BOOST_ROOT="/package/install/boost_1.67.0" \
...
...
script/cmake-cuda_docker.sh
View file @
c1b3fb95
#!/bin/bash
#!/bin/bash
MY_PROJECT_SOURCE
=
../
../../
MY_PROJECT_SOURCE
=
../
MY_PROJECT_INSTALL
=
../install.dir
MY_PROJECT_INSTALL
=
../install.dir
export
CUDA_ROOT
=
/usr/local/cuda
export
CUDA_ROOT
=
/usr/local/cuda
...
@@ -15,7 +15,7 @@ cmake
...
@@ -15,7 +15,7 @@ cmake
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
DEVICE_BACKEND
=
NVIDIA
\
-D
DEVICE_BACKEND
=
NVIDIA
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/root/NVIDIA_CUDA-10.1_Samples/common/inc"
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/root/NVIDIA_CUDA-10.1_Samples/common/inc"
\
-D
CMAKE_CUDA_FLAGS
=
"-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_
70
,code=sm_
70
"
\
-D
CMAKE_CUDA_FLAGS
=
"
-g -G
-ccbin clang++-6.0 -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_60,code=sm_60 -Xptxas -v -gencode=arch=compute_
52
,code=sm_
52
"
\
${
MY_PROJECT_SOURCE
}
${
MY_PROJECT_SOURCE
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment