Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
Paddle
Commits
992bec46
Commit
992bec46
authored
Oct 08, 2023
by
“yuguo”
Browse files
2.5
parent
0259837d
Changes
357
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4270 additions
and
0 deletions
+4270
-0
paddle/cinn/backends/_x86_builtin_source.cc
paddle/cinn/backends/_x86_builtin_source.cc
+435
-0
paddle/cinn/backends/codegen_c.cc
paddle/cinn/backends/codegen_c.cc
+933
-0
paddle/cinn/backends/codegen_c.h
paddle/cinn/backends/codegen_c.h
+131
-0
paddle/cinn/backends/codegen_c_test.cc
paddle/cinn/backends/codegen_c_test.cc
+455
-0
paddle/cinn/backends/codegen_c_x86.cc
paddle/cinn/backends/codegen_c_x86.cc
+167
-0
paddle/cinn/backends/codegen_c_x86.h
paddle/cinn/backends/codegen_c_x86.h
+150
-0
paddle/cinn/backends/codegen_c_x86_test.cc
paddle/cinn/backends/codegen_c_x86_test.cc
+77
-0
paddle/cinn/backends/codegen_cuda_dev.cc
paddle/cinn/backends/codegen_cuda_dev.cc
+423
-0
paddle/cinn/backends/codegen_cuda_dev.h
paddle/cinn/backends/codegen_cuda_dev.h
+119
-0
paddle/cinn/backends/codegen_cuda_generate_test.cc
paddle/cinn/backends/codegen_cuda_generate_test.cc
+68
-0
paddle/cinn/backends/codegen_cuda_host.cc
paddle/cinn/backends/codegen_cuda_host.cc
+186
-0
paddle/cinn/backends/codegen_cuda_host.h
paddle/cinn/backends/codegen_cuda_host.h
+62
-0
paddle/cinn/backends/codegen_cuda_util.cc
paddle/cinn/backends/codegen_cuda_util.cc
+30
-0
paddle/cinn/backends/codegen_cuda_util.h
paddle/cinn/backends/codegen_cuda_util.h
+148
-0
paddle/cinn/backends/codegen_debug_test.cc
paddle/cinn/backends/codegen_debug_test.cc
+127
-0
paddle/cinn/backends/compiler.cc
paddle/cinn/backends/compiler.cc
+259
-0
paddle/cinn/backends/compiler.h
paddle/cinn/backends/compiler.h
+129
-0
paddle/cinn/backends/compiler_test.cc
paddle/cinn/backends/compiler_test.cc
+215
-0
paddle/cinn/backends/cuda_util.cc
paddle/cinn/backends/cuda_util.cc
+56
-0
paddle/cinn/backends/cuda_util.h
paddle/cinn/backends/cuda_util.h
+100
-0
No files found.
Too many changes to show.
To preserve performance only
357 of 357+
files are displayed.
Plain diff
Email patch
paddle/cinn/backends/_x86_builtin_source.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Predefined utilities in CINN BEGIN(
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <immintrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <vector>
#include "paddle/cinn/runtime/cpu/thread_backend.h"
#ifndef _CINN_X86_BUILTIN_SOURCE_
#define _CINN_X86_BUILTIN_SOURCE_
//! Vector in stack, this can only used in generated .cc file.
template
<
typename
T
,
size_t
Num
>
struct
StackVec
{
typedef
T
value_type
;
typedef
StackVec
<
T
,
Num
>
self_type
;
self_type
&
operator
=
(
const
StackVec
&
src
)
{
if
(
this
!=
&
src
)
{
memcpy
(
data_
,
src
.
data_
,
num_bytes
());
}
return
*
this
;
}
StackVec
()
{
memset
(
data_
,
0
,
num_bytes
());
}
explicit
StackVec
(
const
T
*
externl
)
:
external_data_
(
externl
)
{}
static
self_type
Broadcast
(
const
value_type
&
v
)
{
self_type
res
;
for
(
size_t
i
=
0
;
i
<
Num
;
i
++
)
res
.
data_
[
i
]
=
v
;
return
res
;
}
static
self_type
Ramp
(
const
value_type
&
base
,
const
value_type
&
stride
)
{
self_type
res
;
for
(
size_t
i
=
0
;
i
<
Num
;
i
++
)
{
res
.
data_
[
i
]
=
base
+
stride
*
i
;
}
}
static
self_type
Load
(
const
void
*
base
,
int32_t
offset
)
{
self_type
res
;
memcpy
(
&
res
.
data_
[
0
],
(
const
value_type
*
)
base
+
offset
,
num_bytes
());
}
static
self_type
Load
(
const
void
*
base
,
const
StackVec
<
int32_t
,
Num
>&
offset
)
{
self_type
res
;
for
(
size_t
i
=
0
;
i
<
Num
;
i
++
)
{
res
.
data_
[
i
]
=
((
const
value_type
*
)
base
)[
offset
[
i
]];
}
}
void
Store
(
void
*
base
,
int32_t
offset
)
const
{
mempcpy
((
value_type
*
)
base
+
offset
,
&
data_
[
0
],
num_bytes
());
// NOLINT
}
inline
value_type
&
operator
[](
size_t
i
)
{
return
data_
[
i
];
}
inline
value_type
operator
[](
size_t
i
)
const
{
return
data_
[
i
];
}
// binary operator between two vectors
// @{
#define __(op__) \
friend self_type operator op__(const self_type& a, const self_type& b) { \
self_type res; \
for (size_t i = 0; i < Num; i++) { \
res.data_[i] = a[i] op__ b[i]; \
} \
return res; \
}
__
(
+
)
__
(
-
)
__
(
*
)
__
(
/
)
__
(
%
)
// @}
#undef __
// binary operator between a vector and a scalar
// @{
#define __(op__) \
friend self_type operator op__(const self_type& a, const value_type& b) { \
self_type res; \
for (size_t i = 0; i < Num; i++) { \
res.data_[i] = a[i] op__ b; \
} \
return res; \
}
__
(
+
)
__
(
-
)
__
(
*
)
__
(
/
)
__
(
%
)
#undef __
// @}
static
constexpr
size_t
num_bytes
()
{
return
sizeof
(
data_
);
}
private:
T
data_
[
Num
];
T
*
external_data_
{
nullptr
};
};
/**
* The vector with external data.
*/
template
<
typename
T
,
size_t
Num
>
struct
ExternalVec
{
typedef
T
value_type
;
typedef
ExternalVec
<
T
,
Num
>
self_type
;
explicit
ExternalVec
(
T
*
data
)
:
data_
(
data
)
{}
self_type
&
operator
=
(
const
self_type
&
src
)
{
if
(
data_
!=
src
.
data_
)
{
memcpy
(
data_
,
src
.
data_
,
num_bytes
());
}
return
*
this
;
}
static
self_type
Load
(
const
void
*
base
,
int32_t
offset
)
{
self_type
res
((
T
*
)
base
+
offset
);
// NOLINT
return
res
;
}
static
constexpr
size_t
num_bytes
()
{
return
sizeof
(
value_type
)
*
Num
;
}
private:
T
*
data_
{
nullptr
};
};
// AVX256 load
//@{
inline
__m256
cinn_avx256_load
(
const
float
*
dst
)
{
return
_mm256_load_ps
(
dst
);
}
inline
__m256d
cinn_avx256_load
(
const
double
*
dst
)
{
return
_mm256_load_pd
(
dst
);
}
//@}
// AVX512 load
//@{
inline
__m512
cinn_avx512_load
(
const
float
*
dst
)
{
return
_mm512_load_ps
(
dst
);
}
inline
__m512d
cinn_avx512_load
(
const
double
*
dst
)
{
return
_mm512_load_pd
(
dst
);
}
//@}
// FP32x8 * FP32x8
// @{
inline
void
cinn_avx256_add
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_add_ps
(
_mm256_load_ps
(
a
),
_mm256_load_ps
(
b
)));
}
inline
void
cinn_avx256_sub
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_sub_ps
(
_mm256_load_ps
(
a
),
_mm256_load_ps
(
b
)));
}
inline
void
cinn_avx256_mul
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_mul_ps
(
_mm256_load_ps
(
a
),
_mm256_load_ps
(
b
)));
}
inline
void
cinn_avx256_div
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_div_ps
(
_mm256_load_ps
(
a
),
_mm256_load_ps
(
b
)));
}
// @}
// FP32x4 * float
// @{
inline
void
cinn_avx256_add
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm256_store_ps
(
dst
,
_mm256_add_ps
(
_mm256_load_ps
(
a
),
_mm256_set1_ps
(
b
)));
}
inline
void
cinn_avx256_sub
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm256_store_ps
(
dst
,
_mm256_sub_ps
(
_mm256_load_ps
(
a
),
_mm256_set1_ps
(
b
)));
}
inline
void
cinn_avx256_mul
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm256_store_ps
(
dst
,
_mm256_mul_ps
(
_mm256_load_ps
(
a
),
_mm256_set1_ps
(
b
)));
}
inline
void
cinn_avx256_div
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm256_store_ps
(
dst
,
_mm256_div_ps
(
_mm256_load_ps
(
a
),
_mm256_set1_ps
(
b
)));
}
// @}
// float * FP32x4
// @{
inline
void
cinn_avx256_add
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_add_ps
(
_mm256_set1_ps
(
a
),
_mm256_load_ps
(
b
)));
}
inline
void
cinn_avx256_sub
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_sub_ps
(
_mm256_set1_ps
(
a
),
_mm256_load_ps
(
b
)));
}
inline
void
cinn_avx256_mul
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_mul_ps
(
_mm256_set1_ps
(
a
),
_mm256_load_ps
(
b
)));
}
inline
void
cinn_avx256_div
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm256_store_ps
(
dst
,
_mm256_div_ps
(
_mm256_set1_ps
(
a
),
_mm256_load_ps
(
b
)));
}
// @}
// 4 x float64
// @{
inline
void
cinn_avx256_add
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_add_pd
(
_mm256_load_pd
(
a
),
_mm256_load_pd
(
b
)));
}
inline
void
cinn_avx256_sub
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_sub_pd
(
_mm256_load_pd
(
a
),
_mm256_load_pd
(
b
)));
}
inline
void
cinn_avx256_mul
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_mul_pd
(
_mm256_load_pd
(
a
),
_mm256_load_pd
(
b
)));
}
inline
void
cinn_avx256_div
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_div_pd
(
_mm256_load_pd
(
a
),
_mm256_load_pd
(
b
)));
}
// @}
// FP32x4 * FP64
// @{
inline
void
cinn_avx256_add
(
double
*
dst
,
double
*
a
,
double
b
)
{
_mm256_store_pd
(
dst
,
_mm256_add_pd
(
_mm256_load_pd
(
a
),
_mm256_set1_pd
(
b
)));
}
inline
void
cinn_avx256_sub
(
double
*
dst
,
double
*
a
,
double
b
)
{
_mm256_store_pd
(
dst
,
_mm256_sub_pd
(
_mm256_load_pd
(
a
),
_mm256_set1_pd
(
b
)));
}
inline
void
cinn_avx256_mul
(
double
*
dst
,
double
*
a
,
double
b
)
{
_mm256_store_pd
(
dst
,
_mm256_mul_pd
(
_mm256_load_pd
(
a
),
_mm256_set1_pd
(
b
)));
}
inline
void
cinn_avx256_div
(
double
*
dst
,
double
*
a
,
double
b
)
{
_mm256_store_pd
(
dst
,
_mm256_div_pd
(
_mm256_load_pd
(
a
),
_mm256_set1_pd
(
b
)));
}
// @}
// float * FP32x4
// @{
inline
void
cinn_avx256_add
(
double
*
dst
,
double
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_add_pd
(
_mm256_set1_pd
(
a
),
_mm256_load_pd
(
b
)));
}
inline
void
cinn_avx256_sub
(
double
*
dst
,
double
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_sub_pd
(
_mm256_set1_pd
(
a
),
_mm256_load_pd
(
b
)));
}
inline
void
cinn_avx256_mul
(
double
*
dst
,
double
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_mul_pd
(
_mm256_set1_pd
(
a
),
_mm256_load_pd
(
b
)));
}
inline
void
cinn_avx256_div
(
double
*
dst
,
double
a
,
double
*
b
)
{
_mm256_store_pd
(
dst
,
_mm256_div_pd
(
_mm256_set1_pd
(
a
),
_mm256_load_pd
(
b
)));
}
// @}
//! 32 x float32 operations.
// @{
inline
void
cinn_avx512_add
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm512_store_ps
(
dst
,
_mm512_add_ps
(
_mm512_load_ps
(
a
),
_mm512_load_ps
(
b
)));
}
inline
void
cinn_avx512_sub
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm512_store_ps
(
dst
,
_mm512_sub_ps
(
_mm512_load_ps
(
a
),
_mm512_load_ps
(
b
)));
}
inline
void
cinn_avx512_mul
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm512_store_ps
(
dst
,
_mm512_mul_ps
(
_mm512_load_ps
(
a
),
_mm512_load_ps
(
b
)));
}
inline
void
cinn_avx512_div
(
float
*
dst
,
float
*
a
,
float
*
b
)
{
_mm512_store_ps
(
dst
,
_mm512_div_ps
(
_mm512_load_ps
(
a
),
_mm512_load_ps
(
b
)));
}
// @}
// FP32x4 * FP64
// @{
inline
void
cinn_avx512_add
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm512_store_pd
(
dst
,
_mm512_add_pd
(
_mm512_load_pd
(
a
),
_mm512_set1_pd
(
b
)));
}
inline
void
cinn_avx512_sub
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm512_store_pd
(
dst
,
_mm512_sub_pd
(
_mm512_load_pd
(
a
),
_mm512_set1_pd
(
b
)));
}
inline
void
cinn_avx512_mul
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm512_store_pd
(
dst
,
_mm512_mul_pd
(
_mm512_load_pd
(
a
),
_mm512_set1_pd
(
b
)));
}
inline
void
cinn_avx512_div
(
float
*
dst
,
float
*
a
,
float
b
)
{
_mm512_store_pd
(
dst
,
_mm512_div_pd
(
_mm512_load_pd
(
a
),
_mm512_set1_pd
(
b
)));
}
// @}
// float * FP32x4
// @{
inline
void
cinn_avx512_add
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_add_pd
(
_mm512_set1_pd
(
a
),
_mm512_load_pd
(
b
)));
}
inline
void
cinn_avx512_sub
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_sub_pd
(
_mm512_set1_pd
(
a
),
_mm512_load_pd
(
b
)));
}
inline
void
cinn_avx512_mul
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_mul_pd
(
_mm512_set1_pd
(
a
),
_mm512_load_pd
(
b
)));
}
inline
void
cinn_avx512_div
(
float
*
dst
,
float
a
,
float
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_div_pd
(
_mm512_set1_pd
(
a
),
_mm512_load_pd
(
b
)));
}
// @}
//! 16 x float32 operations.
// @{
inline
void
cinn_avx512_add
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_add_pd
(
_mm512_load_pd
(
a
),
_mm512_load_pd
(
b
)));
}
inline
void
cinn_avx512_sub
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_sub_pd
(
_mm512_load_pd
(
a
),
_mm512_load_pd
(
b
)));
}
inline
void
cinn_avx512_mul
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_mul_pd
(
_mm512_load_pd
(
a
),
_mm512_load_pd
(
b
)));
}
inline
void
cinn_avx512_div
(
double
*
dst
,
double
*
a
,
double
*
b
)
{
_mm512_store_pd
(
dst
,
_mm512_div_pd
(
_mm512_load_pd
(
a
),
_mm512_load_pd
(
b
)));
}
// @}
inline
__m512
cinn_avx512_add
(
const
__m512
&
a
,
const
__m512
&
b
);
inline
__m256
cinn_avx256_add_float
(
const
__m256
&
a
,
const
__m256
&
b
)
{
return
_mm256_add_ps
(
a
,
b
);
}
inline
__m256d
cinn_avx256_add_double
(
const
__m256d
&
a
,
const
__m256d
&
b
)
{
return
_mm256_add_pd
(
a
,
b
);
}
inline
__m512
cinn_avx512_add_float
(
const
__m512
&
a
,
const
__m512
&
b
)
{
return
_mm512_add_ps
(
a
,
b
);
}
inline
__m512d
cinn_avx512_add_double
(
const
__m512d
&
a
,
const
__m512d
&
b
)
{
return
_mm512_add_pd
(
a
,
b
);
}
//! set1
// @{
inline
__m256
cinn_avx256_set1
(
float
value
)
{
return
_mm256_set1_ps
(
value
);
}
inline
__m256d
cinn_avx256_set1
(
double
value
)
{
return
_mm256_set1_pd
(
value
);
}
inline
__m512
cinn_avx512_set1
(
float
value
)
{
return
_mm512_set1_ps
(
value
);
}
inline
__m512d
cinn_avx512_set1
(
double
value
)
{
return
_mm512_set1_pd
(
value
);
}
// @}
//! store
// @{
inline
void
cinn_avx512_store
(
float
*
dst
,
const
__m512
&
x
)
{
_mm512_store_ps
(
dst
,
x
);
}
inline
void
cinn_avx512_store
(
double
*
dst
,
const
__m512d
&
x
)
{
_mm512_store_pd
(
dst
,
x
);
}
inline
void
cinn_avx256_store
(
float
*
dst
,
const
__m256
&
x
)
{
_mm256_store_ps
(
dst
,
x
);
}
inline
void
cinn_avx256_store
(
double
*
dst
,
const
__m256d
&
x
)
{
_mm256_store_pd
(
dst
,
x
);
}
// @}
//! add
// @{
inline
__m256
cinn_avx256_add
(
const
__m256
&
a
,
const
__m256
&
b
)
{
return
_mm256_add_ps
(
a
,
b
);
}
inline
__m256d
cinn_avx256_add
(
const
__m256d
&
a
,
const
__m256d
&
b
)
{
return
_mm256_add_pd
(
a
,
b
);
}
inline
__m512
cinn_avx512_add
(
const
__m512
&
a
,
const
__m512
&
b
)
{
return
_mm512_add_ps
(
a
,
b
);
}
inline
__m512d
cinn_avx512_add
(
const
__m512d
&
a
,
const
__m512d
&
b
)
{
return
_mm512_add_pd
(
a
,
b
);
}
// @}
//! mul
// @{
inline
__m256
cinn_avx256_mul
(
const
__m256
&
a
,
const
__m256
&
b
)
{
return
_mm256_mul_ps
(
a
,
b
);
}
inline
__m256d
cinn_avx256_mul
(
const
__m256d
&
a
,
const
__m256d
&
b
)
{
return
_mm256_mul_pd
(
a
,
b
);
}
inline
__m512
cinn_avx512_mul
(
const
__m512
&
a
,
const
__m512
&
b
)
{
return
_mm512_mul_ps
(
a
,
b
);
}
inline
__m512d
cinn_avx512_mul
(
const
__m512d
&
a
,
const
__m512d
&
b
)
{
return
_mm512_mul_pd
(
a
,
b
);
}
// @}
//! fma
// @{
inline
__m128
cinn_avx128_fma
(
const
__m128
&
a
,
const
__m128
&
b
,
const
__m128
&
c
)
{
return
_mm_fmadd_ps
(
a
,
b
,
c
);
}
inline
__m128d
cinn_avx128_fma
(
const
__m128d
&
a
,
const
__m128d
&
b
,
const
__m128d
&
c
)
{
return
_mm_fmadd_pd
(
a
,
b
,
c
);
}
inline
__m256
cinn_avx256_fma
(
const
__m256
&
a
,
const
__m256
&
b
,
const
__m256
&
c
)
{
return
_mm256_fmadd_ps
(
a
,
b
,
c
);
}
inline
__m256d
cinn_avx256_fma
(
const
__m256d
&
a
,
const
__m256d
&
b
,
const
__m256d
&
c
)
{
return
_mm256_fmadd_pd
(
a
,
b
,
c
);
}
inline
__m512
cinn_avx512_fma
(
const
__m512
&
a
,
const
__m512
&
b
,
const
__m512
&
c
)
{
return
_mm512_fmadd_ps
(
a
,
b
,
c
);
}
inline
__m512d
cinn_avx512_fma
(
const
__m512d
&
a
,
const
__m512d
&
b
,
const
__m512d
&
c
)
{
return
_mm512_fmadd_pd
(
a
,
b
,
c
);
}
// @}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// )END Predefined utilities in CINN
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#endif // _CINN_X86_BUILTIN_SOURCE_
paddle/cinn/backends/codegen_c.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c.h"
#include <fstream>
#include <string>
#include "paddle/cinn/backends/extern_func_emitter.h"
#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/op/ir_operators.h"
#include "paddle/cinn/ir/utils/ir_verify.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/optim/remove_nested_block.h"
#include "paddle/cinn/runtime/cpu/thread_backend.h"
#include "paddle/cinn/runtime/intrinsic.h"
#include "paddle/cinn/utils/string.h"
//! Root of the builtin code.
DECLARE_string
(
cinn_x86_builtin_code_root
);
namespace
cinn
{
namespace
backends
{
using
namespace
utils
;
// NOLINT
using
cinn
::
common
::
float16
;
const
char
*
kCKeywordRestrict
=
"__restrict__"
;
void
CodeGenC
::
Compile
(
const
ir
::
Module
&
module
,
const
Outputs
&
outputs
)
{
ir
::
IrVerify
(
Expr
(
module
));
if
(
!
outputs
.
c_header_name
.
empty
())
{
auto
source
=
Compile
(
module
,
OutputKind
::
CHeader
);
str_
=
""
;
std
::
ofstream
file
(
outputs
.
c_header_name
);
CHECK
(
file
.
is_open
())
<<
"failed to open file "
<<
outputs
.
c_header_name
;
file
<<
source
;
file
.
close
();
LOG
(
WARNING
)
<<
"Output C header to file "
<<
outputs
.
c_header_name
;
}
if
(
!
outputs
.
c_source_name
.
empty
())
{
auto
source
=
Compile
(
module
,
OutputKind
::
CImpl
);
str_
=
""
;
std
::
ofstream
file
(
outputs
.
c_source_name
);
CHECK
(
file
.
is_open
())
<<
"failed to open file "
<<
outputs
.
c_source_name
;
file
<<
source
;
file
.
close
();
LOG
(
WARNING
)
<<
"Output C source to file "
<<
outputs
.
c_source_name
;
}
}
CodeGenC
::
CodeGenC
(
Target
target
)
:
ir
::
IrPrinter
(
ss_
)
{}
std
::
string
CodeGenC
::
Compile
(
const
ir
::
Module
&
module
,
OutputKind
output_kind
)
{
if
(
output_kind
==
OutputKind
::
CHeader
)
{
GenerateHeaderFile
(
module
);
}
else
if
(
output_kind
==
OutputKind
::
CImpl
)
{
PrintIncludes
();
if
(
inline_builtin_codes_
)
PrintBuiltinCodes
();
for
(
auto
&
func
:
module
.
functions
())
{
Compile
(
func
);
}
}
else
{
LOG
(
FATAL
)
<<
"Not supported OutputKind"
;
}
return
str_
;
}
// TODO(LiuYang): Here the Ret type seems unuseful
void
CodeGenC
::
Compile
(
const
ir
::
LoweredFunc
&
function
)
{
CHECK
(
function
.
defined
());
IrPrinter
::
Visit
(
function
);
str_
+=
"
\n\n
"
;
}
std
::
string
CodeGenC
::
GetTypeName
(
Type
type
)
{
// common scalar type
#define GET_SCALAR_TYPE(pred_expr, scalar_name) \
if (pred_expr) { \
return scalar_name; \
}
GET_SCALAR_TYPE
(
type
.
is_void
(),
"void"
);
GET_SCALAR_TYPE
(
type
.
is_bool
(),
"bool"
);
GET_SCALAR_TYPE
(
type
.
is_int
(
8
),
"int8_t"
);
GET_SCALAR_TYPE
(
type
.
is_int
(
16
),
"int16_t"
);
GET_SCALAR_TYPE
(
type
.
is_int
(
32
),
"int32_t"
);
GET_SCALAR_TYPE
(
type
.
is_int
(
64
),
"int64_t"
);
GET_SCALAR_TYPE
(
type
.
is_uint
(
8
),
"uint8_t"
);
GET_SCALAR_TYPE
(
type
.
is_uint
(
16
),
"uint16_t"
);
GET_SCALAR_TYPE
(
type
.
is_uint
(
32
),
"uint32_t"
);
GET_SCALAR_TYPE
(
type
.
is_uint
(
64
),
"uint64_t"
);
GET_SCALAR_TYPE
(
type
.
is_bfloat16
(),
"bfloat16"
);
GET_SCALAR_TYPE
(
type
.
is_float16
(),
"float16"
);
GET_SCALAR_TYPE
(
type
.
is_float
(
32
),
"float"
)
GET_SCALAR_TYPE
(
type
.
is_float
(
64
),
"double"
)
#undef GET_SCALAR_TYPE
// customized_type
if
(
type
.
is_customized_type
())
{
CHECK
(
!
type
.
customized_type
().
empty
())
<<
"customized_type can't be empty."
;
auto
customized_name
=
type
.
customized_type
();
// get name of a cuda built-in vector type, it is started with a
// 'CudaVectorType::' prefix
if
(
utils
::
Startswith
(
customized_name
,
common
::
customized_type
::
kcuda_builtin_vector_t
))
{
customized_name
.
erase
(
0
,
strlen
(
common
::
customized_type
::
kcuda_builtin_vector_t
));
}
return
customized_name
;
}
// other types are not implemented yet
CINN_NOT_IMPLEMENTED
return
""
;
}
std
::
string
CodeGenC
::
GetTypeRepr
(
Type
type
)
{
std
::
string
str
;
if
(
type
.
is_cpp_const
())
{
str
=
"const "
;
}
str
+=
GetTypeName
(
type
);
if
(
type
.
is_cpp_handle
())
{
str
+=
"*"
;
}
else
if
(
type
.
is_cpp_handle2
())
{
str
+=
"**"
;
}
return
str
;
}
void
CodeGenC
::
Visit
(
const
ir
::
IntImm
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
UIntImm
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
FloatImm
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
StringImm
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Add
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Sub
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Mul
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Div
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Mod
*
op
)
{
auto
copied
=
op
->
b
();
optim
::
Simplify
(
&
copied
);
if
(
copied
.
is_constant
())
{
int
temp
=
static_cast
<
int
>
(
copied
.
get_constant
());
if
((
temp
&
(
temp
-
1
))
==
0
)
{
str_
+=
"("
;
IrPrinter
::
Visit
(
op
->
a
());
str_
+=
" & "
;
str_
+=
std
::
to_string
(
temp
-
1
);
str_
+=
")"
;
return
;
}
}
PrintBinaryOp
(
"%"
,
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
EQ
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
NE
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
LT
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
LE
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
GT
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
GE
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
And
*
op
)
{
PrintBinaryOp
(
"&&"
,
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Or
*
op
)
{
PrintBinaryOp
(
"||"
,
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Min
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Max
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Minus
*
op
)
{
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Not
*
op
)
{
str_
+=
"(!"
;
IrPrinter
::
Visit
(
op
->
v
());
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
Cast
*
op
)
{
PrintCastExpr
(
op
->
type
(),
op
->
v
());
}
void
CodeGenC
::
Visit
(
const
ir
::
For
*
op
)
{
Expr
extent
=
op
->
extent
;
Expr
min
=
op
->
min
;
int
num_task
=
1
;
if
(
op
->
is_parallel
())
{
str_
+=
"int num_task = max_concurrency();
\n
"
;
DoIndent
();
str_
+=
"omp_set_num_threads(num_task);
\n
"
;
DoIndent
();
str_
+=
"auto flambda = [=](int task_id, int num_task) -> int {
\n
"
;
IncIndent
();
DoIndent
();
str_
+=
"int n_per_task = "
;
Expr
num_task_var
=
Var
(
"num_task"
);
IrPrinter
::
Visit
((
op
->
extent
+
num_task_var
-
1
)
/
num_task_var
);
str_
+=
";
\n
"
;
CHECK_EQ
(
min
.
as_int32
(),
0
);
auto
task_id
=
Var
(
"task_id"
);
auto
n_per_task
=
Var
(
"n_per_task"
);
min
=
task_id
*
n_per_task
;
extent
=
(
task_id
+
1
)
*
n_per_task
;
DoIndent
();
}
str_
+=
"for ("
;
str_
+=
GetTypeRepr
(
Int
(
32
));
str_
+=
" "
;
str_
+=
op
->
loop_var
->
name
;
str_
+=
" = "
;
IrPrinter
::
Visit
(
min
);
str_
+=
"; "
;
str_
+=
op
->
loop_var
->
name
;
str_
+=
" < "
;
IrPrinter
::
Visit
(
op
->
extent
);
if
(
op
->
is_parallel
())
{
str_
+=
" && "
;
str_
+=
op
->
loop_var
->
name
;
str_
+=
" < "
;
IrPrinter
::
Visit
(
extent
);
}
str_
+=
"; "
;
str_
+=
op
->
loop_var
->
name
;
str_
+=
" += 1"
;
str_
+=
") "
;
IrPrinter
::
Visit
(
op
->
body
);
if
(
op
->
is_parallel
())
{
str_
+=
"
\n
"
;
DoIndent
();
str_
+=
"return 0;
\n
"
;
DecIndent
();
DoIndent
();
str_
+=
"};
\n
"
;
str_
+=
"#pragma omp parallel num_threads(num_task)
\n
"
;
DoIndent
();
str_
+=
"{
\n
"
;
IncIndent
();
DoIndent
();
str_
+=
"int task_id = omp_get_thread_num();
\n
"
;
DoIndent
();
str_
+=
"flambda(task_id, num_task);
\n
"
;
DecIndent
();
DoIndent
();
str_
+=
"}"
;
}
}
void
CodeGenC
::
Visit
(
const
ir
::
PolyFor
*
op
)
{
str_
+=
"for ("
;
str_
+=
GetTypeRepr
(
Int
(
32
));
str_
+=
" "
;
str_
+=
op
->
iterator
->
name
;
str_
+=
" = "
;
IrPrinter
::
Visit
(
op
->
init
);
str_
+=
"; "
;
IrPrinter
::
Visit
(
op
->
condition
);
str_
+=
"; "
;
str_
+=
op
->
iterator
->
name
;
str_
+=
" += "
;
IrPrinter
::
Visit
(
op
->
inc
);
str_
+=
") "
;
IrPrinter
::
Visit
(
op
->
body
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Select
*
op
)
{
str_
+=
"("
;
str_
+=
"("
;
IrPrinter
::
Visit
(
op
->
condition
);
str_
+=
") ? "
;
IrPrinter
::
Visit
(
op
->
true_value
);
str_
+=
" : "
;
IrPrinter
::
Visit
(
op
->
false_value
);
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
IfThenElse
*
op
)
{
str_
+=
"if ("
;
IrPrinter
::
Visit
(
op
->
condition
);
str_
+=
") {
\n
"
;
if
(
!
op
->
true_case
.
As
<
ir
::
Block
>
())
IncIndent
();
DoIndent
();
IrPrinter
::
Visit
(
op
->
true_case
);
if
(
!
op
->
true_case
.
As
<
ir
::
Block
>
())
str_
+=
";"
;
str_
+=
"
\n
"
;
if
(
!
op
->
true_case
.
As
<
ir
::
Block
>
())
DecIndent
();
DoIndent
();
str_
+=
"}"
;
if
(
op
->
false_case
.
defined
())
{
str_
+=
" else {
\n
"
;
if
(
!
op
->
true_case
.
As
<
ir
::
Block
>
())
IncIndent
();
DoIndent
();
IrPrinter
::
Visit
(
op
->
false_case
);
if
(
!
op
->
false_case
.
As
<
ir
::
Block
>
())
str_
+=
";"
;
str_
+=
"
\n
"
;
if
(
!
op
->
true_case
.
As
<
ir
::
Block
>
())
DecIndent
();
DoIndent
();
str_
+=
"}"
;
}
}
void
CodeGenC
::
Visit
(
const
ir
::
Block
*
op
)
{
str_
+=
"{
\n
"
;
IncIndent
();
for
(
int
i
=
0
;
i
<
op
->
stmts
.
size
()
-
1
;
i
++
)
{
DoIndent
();
IrPrinter
::
Visit
(
op
->
stmts
[
i
]);
str_
+=
";
\n
"
;
}
if
(
op
->
stmts
.
size
()
>=
1
)
{
DoIndent
();
IrPrinter
::
Visit
(
op
->
stmts
.
back
());
str_
+=
";"
;
}
DecIndent
();
str_
+=
"
\n
"
;
DoIndent
();
str_
+=
"}"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
Call
*
op
)
{
if
(
op
->
name
==
runtime
::
intrinsic
::
buffer_malloc
)
{
PrintCall_buffer_malloc
(
op
);
}
else
if
(
op
->
name
==
runtime
::
intrinsic
::
pod_values_to_array_repr
)
{
PrintCall_pod_values_to_array
(
op
);
}
else
if
(
op
->
is_intrinsic_call
())
{
str_
+=
op
->
name
;
str_
+=
"("
;
PrintCallArgs
(
op
);
str_
+=
")"
;
}
else
if
(
op
->
is_cinn_call
())
{
// call CINN LoweredFunc
str_
+=
op
->
name
;
str_
+=
"("
;
PrintCallArgs
(
op
);
str_
+=
")"
;
}
else
if
(
op
->
is_extern_call
())
{
const
auto
&
fn_name
=
ExternFunctionEmitterRegistry
::
Global
().
Lookup
(
ExternFuncID
{
backend_C
,
op
->
name
.
c_str
()});
if
(
!
fn_name
.
empty
())
{
ExternFunctionLLVMEmitter
emitter
(
fn_name
);
emitter
.
BindCodeGen
(
this
);
emitter
.
Emit
(
op
);
}
else
{
CHECK
(
!
op
->
read_args
.
empty
()
||
!
op
->
write_args
.
empty
());
str_
+=
op
->
name
;
str_
+=
"("
;
PrintCallArgs
(
op
);
str_
+=
")"
;
}
}
else
{
CINN_NOT_IMPLEMENTED
}
}
void
CodeGenC
::
PrintCallArgs
(
const
ir
::
Call
*
op
)
{
if
(
!
op
->
read_args
.
empty
())
{
for
(
int
i
=
0
;
i
<
op
->
read_args
.
size
()
-
1
;
i
++
)
{
IrPrinter
::
Visit
(
op
->
read_args
[
i
]);
str_
+=
", "
;
}
IrPrinter
::
Visit
(
op
->
read_args
.
back
());
}
if
(
!
op
->
write_args
.
empty
())
{
if
(
!
op
->
read_args
.
empty
())
str_
+=
", "
;
for
(
int
i
=
0
;
i
<
op
->
write_args
.
size
()
-
1
;
i
++
)
{
IrPrinter
::
Visit
(
op
->
write_args
[
i
]);
str_
+=
", "
;
}
IrPrinter
::
Visit
(
op
->
write_args
.
back
());
}
}
void
CodeGenC
::
PrintCall_buffer_malloc
(
const
ir
::
Call
*
op
)
{
CHECK_EQ
(
op
->
read_args
.
size
(),
2UL
);
str_
+=
op
->
name
;
str_
+=
"("
;
PrintCastExpr
(
"void*"
,
op
->
read_args
[
0
]);
str_
+=
", "
;
IrPrinter
::
Visit
(
op
->
read_args
[
1
]);
str_
+=
")"
;
}
void
CodeGenC
::
PrintCall_cinn_pod_value_to_
(
const
ir
::
Call
*
op
)
{
CHECK_EQ
(
op
->
read_args
.
size
(),
1UL
);
str_
+=
op
->
name
;
str_
+=
"("
;
str_
+=
"&("
;
IrPrinter
::
Visit
(
op
->
read_args
[
0
]);
str_
+=
")"
;
str_
+=
")"
;
}
void
CodeGenC
::
PrintCall_get_address
(
const
ir
::
Call
*
op
)
{
CHECK_EQ
(
op
->
read_args
.
size
(),
1UL
);
CHECK
(
op
->
write_args
.
empty
());
auto
*
read_var
=
op
->
read_args
.
front
().
as_var
();
auto
*
read_buf
=
op
->
read_args
.
front
().
as_buffer
();
CHECK
(
read_var
||
read_buf
)
<<
"Only Var or Buffer can get address"
;
if
(
read_var
)
{
if
(
read_var
->
type
().
lanes
()
<=
1
)
str_
+=
"&"
;
str_
+=
read_var
->
name
;
}
else
if
(
read_buf
)
{
if
(
read_buf
->
type
().
lanes
()
<=
1
)
str_
+=
"&"
;
str_
+=
read_buf
->
name
;
}
else
{
CINN_NOT_IMPLEMENTED
}
}
void
CodeGenC
::
PrintCall_pod_values_to_array
(
const
ir
::
Call
*
op
)
{
CHECK
(
!
op
->
read_args
.
empty
());
CHECK_EQ
(
op
->
write_args
.
size
(),
1UL
);
auto
output_var
=
op
->
write_args
.
front
().
as_var_ref
();
CHECK
(
output_var
.
defined
());
std
::
vector
<
std
::
string
>
arg_names
;
for
(
auto
&
arg
:
op
->
read_args
)
{
auto
arg_var
=
arg
.
as_var
();
CHECK
(
arg_var
);
arg_names
.
push_back
(
arg_var
->
name
);
}
str_
+=
"cinn_pod_value_t "
;
str_
+=
output_var
->
name
;
str_
+=
"[] = "
;
str_
+=
"{ "
;
str_
+=
utils
::
Join
(
arg_names
,
", "
);
str_
+=
" }"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
_Module_
*
op
)
{
CINN_NOT_IMPLEMENTED
}
void
CodeGenC
::
Visit
(
const
ir
::
_Var_
*
op
)
{
str_
+=
op
->
name
;
}
void
CodeGenC
::
Visit
(
const
ir
::
Load
*
op
)
{
Expr
dense_strided_ramp
=
detail
::
StridedRampBase
(
op
->
index
(),
1
);
if
(
dense_strided_ramp
.
defined
())
{
// Loading a continuous Ramp address.
CHECK
(
op
->
type
().
is_vector
());
PrintStackVecType
(
op
->
type
().
ElementOf
(),
op
->
index
().
type
().
lanes
());
str_
+=
"::"
;
str_
+=
"Load("
;
str_
+=
op
->
tensor
.
As
<
ir
::
_Tensor_
>
()
->
name
;
str_
+=
","
;
IrPrinter
::
Visit
(
dense_strided_ramp
);
str_
+=
")"
;
}
else
if
(
op
->
index
().
type
().
is_vector
())
{
// gather
CHECK
(
op
->
type
().
is_vector
());
PrintStackVecType
(
op
->
type
().
ElementOf
(),
op
->
index
().
type
().
lanes
());
str_
+=
"::Load("
;
str_
+=
op
->
tensor
.
As
<
ir
::
_Tensor_
>
()
->
name
;
str_
+=
","
;
IrPrinter
::
Visit
(
op
->
index
());
str_
+=
")"
;
}
else
if
(
op
->
is_addr_tensor
())
{
auto
*
tensor
=
op
->
tensor
.
As
<
ir
::
_Tensor_
>
();
str_
+=
tensor
->
name
;
str_
+=
"["
;
IrPrinter
::
Visit
(
op
->
index
());
str_
+=
"]"
;
}
else
{
IrPrinter
::
Visit
(
op
);
}
}
void
CodeGenC
::
Visit
(
const
ir
::
Store
*
op
)
{
CHECK
(
op
->
is_addr_tensor
());
auto
*
tensor
=
op
->
tensor
.
As
<
ir
::
_Tensor_
>
();
CHECK
(
tensor
);
str_
+=
tensor
->
name
;
str_
+=
"["
;
IrPrinter
::
Visit
(
op
->
index
());
str_
+=
"]"
;
str_
+=
" = "
;
IrPrinter
::
Visit
(
op
->
value
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Alloc
*
op
)
{
str_
+=
runtime
::
intrinsic
::
buffer_malloc
;
str_
+=
"("
;
str_
+=
"(void*)(0), "
;
auto
*
buffer
=
op
->
destination
.
As
<
ir
::
_Buffer_
>
();
str_
+=
buffer
->
name
;
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
Free
*
op
)
{
str_
+=
runtime
::
intrinsic
::
buffer_free
;
str_
+=
"("
;
str_
+=
"(void*)(0), "
;
auto
*
buffer
=
op
->
destination
.
As
<
ir
::
_Buffer_
>
();
str_
+=
buffer
->
name
;
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
_Buffer_
*
op
)
{
str_
+=
op
->
name
;
}
void
CodeGenC
::
Visit
(
const
ir
::
_Tensor_
*
op
)
{
str_
+=
op
->
buffer
->
name
;
}
void
CodeGenC
::
Visit
(
const
ir
::
Let
*
op
)
{
bool
is_vec
=
false
;
CHECK
(
op
->
type
().
valid
());
if
(
op
->
body
.
defined
()
&&
op
->
body
.
As
<
ir
::
Broadcast
>
())
{
// broadcast's type is hard to print, so use c++11 auto instead.
str_
+=
"auto"
;
is_vec
=
true
;
}
else
{
str_
+=
GetTypeRepr
(
op
->
type
());
}
str_
+=
" "
;
IrPrinter
::
Visit
(
op
->
symbol
);
// native C array.
if
(
op
->
type
().
lanes
()
>
1
&&
!
is_vec
)
{
str_
+=
"["
;
str_
+=
std
::
to_string
(
op
->
type
().
lanes
());
str_
+=
"]"
;
}
if
(
op
->
body
.
defined
())
{
str_
+=
" = "
;
IrPrinter
::
Visit
(
op
->
body
);
}
}
void
CodeGenC
::
Visit
(
const
ir
::
Reduce
*
op
)
{
LOG
(
FATAL
)
<<
"Reduce IR is just for internal representation, should not be "
"used for CodeGen."
;
}
void
CodeGenC
::
Visit
(
const
ir
::
Ramp
*
op
)
{
str_
+=
"StackVec<"
;
str_
+=
std
::
to_string
(
op
->
lanes
);
str_
+=
","
;
str_
+=
GetTypeRepr
(
op
->
type
().
ElementOf
());
str_
+=
">::Ramp("
;
IrPrinter
::
Visit
(
op
->
base
);
str_
+=
", "
;
IrPrinter
::
Visit
(
op
->
stride
);
str_
+=
", "
;
str_
+=
std
::
to_string
(
op
->
lanes
);
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
Broadcast
*
op
)
{
str_
+=
"StackVec<"
;
str_
+=
std
::
to_string
(
op
->
lanes
);
str_
+=
","
;
str_
+=
GetTypeRepr
(
op
->
type
().
ElementOf
());
str_
+=
">::Broadcast("
;
IrPrinter
::
Visit
(
op
->
value
);
str_
+=
", "
;
str_
+=
std
::
to_string
(
op
->
lanes
);
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
FracOp
*
op
)
{
ir
::
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Sum
*
op
)
{
ir
::
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
Visit
(
const
ir
::
Product
*
op
)
{
ir
::
IrPrinter
::
Visit
(
op
);
}
void
CodeGenC
::
PrintCastExpr
(
const
Type
&
type
,
Expr
e
)
{
str_
+=
"(("
;
str_
+=
GetTypeRepr
(
type
);
str_
+=
")"
;
str_
+=
"("
;
IrPrinter
::
Visit
(
e
);
str_
+=
"))"
;
}
void
CodeGenC
::
PrintCastExpr
(
const
std
::
string
&
type
,
Expr
e
)
{
str_
+=
"("
;
str_
+=
type
;
str_
+=
")"
;
str_
+=
"("
;
IrPrinter
::
Visit
(
e
);
str_
+=
")"
;
}
void
CodeGenC
::
PrintShape
(
const
std
::
vector
<
Expr
>
&
shape
,
char
leftb
,
char
rightb
)
{
str_
+=
leftb
;
str_
+=
" "
;
for
(
int
i
=
0
;
i
<
shape
.
size
()
-
1
;
i
++
)
{
IrPrinter
::
Visit
(
shape
[
i
]);
str_
+=
", "
;
}
if
(
shape
.
size
()
>
1
)
IrPrinter
::
Visit
(
shape
.
back
());
str_
+=
" "
;
str_
+=
rightb
;
}
void
CodeGenC
::
Visit
(
const
ir
::
_LoweredFunc_
*
op
)
{
PrintFunctionDeclaration
(
op
);
str_
+=
"
\n
"
;
DoIndent
();
CHECK_EQ
(
op
->
alloc_output_buffer_exprs
.
size
(),
op
->
dealloc_output_buffer_exprs
.
size
())
<<
"the count of allocation and deallocaton expressions is not match"
;
std
::
vector
<
Expr
>
new_body
;
std
::
vector
<
Expr
>
create_temp_buffers
=
op
->
PrepareCreateTempBufferExprs
();
std
::
vector
<
Expr
>
alloca_temp_buffers
=
op
->
PrepareAllocTempBufferExprs
();
std
::
vector
<
Expr
>
dealloca_temp_buffers
=
op
->
PrepareDeallocTempBufferExprs
();
#define APPEND_TO_NEW_BODY(field__) \
new_body.insert( \
std::end(new_body), std::begin(op->field__), std::end(op->field__));
APPEND_TO_NEW_BODY
(
argument_prepare_exprs
)
new_body
.
insert
(
std
::
end
(
new_body
),
std
::
begin
(
create_temp_buffers
),
std
::
end
(
create_temp_buffers
));
APPEND_TO_NEW_BODY
(
alloc_output_buffer_exprs
)
new_body
.
insert
(
std
::
end
(
new_body
),
std
::
begin
(
alloca_temp_buffers
),
std
::
end
(
alloca_temp_buffers
));
APPEND_TO_NEW_BODY
(
buffer_data_cast_exprs
)
new_body
.
push_back
(
op
->
body
);
new_body
.
insert
(
std
::
end
(
new_body
),
std
::
begin
(
dealloca_temp_buffers
),
std
::
end
(
dealloca_temp_buffers
));
APPEND_TO_NEW_BODY
(
dealloc_output_buffer_exprs
)
Expr
func_body
=
ir
::
Block
::
Make
(
new_body
);
optim
::
RemoveNestedBlock
(
&
func_body
);
IrPrinter
::
Visit
(
func_body
);
}
void
CodeGenC
::
PrintIncludes
()
{
str_
+=
"#include <cinn_runtime.h>
\n
"
;
str_
+=
"#include <stdio.h>
\n
"
;
str_
+=
"
\n
"
;
}
void
CodeGenC
::
PrintFileGuardOpen
(
const
std
::
string
&
name
)
{
str_
+=
utils
::
StringFormat
(
"#ifndef _%s_CINN_H_
\n
"
,
Uppercase
(
name
).
c_str
());
str_
+=
utils
::
StringFormat
(
"#define _%s_CINN_H_
\n
"
,
Uppercase
(
name
).
c_str
());
str_
+=
"
\n
"
;
}
void
CodeGenC
::
PrintFileGuardClose
(
const
std
::
string
&
module_name
)
{
str_
+=
utils
::
StringFormat
(
"#endif // _%s_CINN_H_
\n
"
,
Uppercase
(
module_name
).
c_str
());
}
void
CodeGenC
::
PrintBufferCreation
(
const
std
::
vector
<
ir
::
Buffer
>
&
buffers
)
{
for
(
auto
&
buffer
:
buffers
)
{
// Ignore the buffer in other devices.
if
(
!
buffer
->
is_on_host
())
continue
;
DoIndent
();
auto
buffer_ptr_type
=
Type
()
.
set_customized_type
(
common
::
customized_type
::
kbuffer_t
)
.
set_cpp_handle
();
Var
variable
=
ir
::
_Var_
::
Make
(
buffer
->
name
,
buffer_ptr_type
);
auto
expr
=
ir
::
intrinsics
::
BufferCreate
::
Make
(
buffer
);
expr
=
ir
::
Let
::
Make
(
variable
,
expr
);
IrPrinter
::
Visit
(
expr
);
str_
+=
";
\n
"
;
}
}
void
CodeGenC
::
PrintBufferDestroy
(
const
std
::
vector
<
ir
::
Buffer
>
&
buffers
)
{
for
(
auto
&
buffer
:
buffers
)
{
DoIndent
();
IrPrinter
::
Visit
(
buffer
.
DestroyExpr
());
str_
+=
";
\n
"
;
}
}
void
CodeGenC
::
GenerateHeaderFile
(
const
ir
::
Module
&
module
)
{
PrintFileGuardOpen
(
module
.
name
());
PrintIncludes
();
for
(
auto
&
func
:
module
.
functions
())
{
PrintFunctionDeclaration
(
func
.
As
<
ir
::
_LoweredFunc_
>
());
str_
+=
";
\n
"
;
str_
+=
"
\n\n
"
;
}
PrintFileGuardClose
(
module
.
name
());
}
void
CodeGenC
::
PrintFuncArg
(
const
ir
::
Argument
&
arg
)
{
if
(
arg
.
is_buffer
())
{
if
(
arg
.
is_input
())
{
str_
+=
"const struct cinn_buffer_t *"
;
}
else
{
str_
+=
"struct cinn_buffer_t *"
;
}
}
else
if
(
arg
.
is_var
())
{
str_
+=
GetTypeRepr
(
arg
.
type
());
str_
+=
" "
;
str_
+=
arg
.
name
();
}
else
{
CINN_NOT_IMPLEMENTED
}
str_
+=
arg
.
name
();
}
void
CodeGenC
::
PrintRuntimeType
(
const
cinn_type_t
&
type
)
{
if
(
type
==
cinn_bool_t
())
{
str_
+=
"cinn_bool_t()"
;
}
else
if
(
type
==
cinn_int8_t
())
{
str_
+=
"cinn_int8_t()"
;
}
else
if
(
type
==
cinn_int16_t
())
{
str_
+=
"cinn_int16_t()"
;
}
else
if
(
type
==
cinn_int32_t
())
{
str_
+=
"cinn_int32_t()"
;
}
else
if
(
type
==
cinn_int64_t
())
{
str_
+=
"cinn_int64_t()"
;
}
else
if
(
type
==
cinn_uint8_t
())
{
str_
+=
"cinn_uint8_t()"
;
}
else
if
(
type
==
cinn_uint16_t
())
{
str_
+=
"cinn_uint16_t()"
;
}
else
if
(
type
==
cinn_uint32_t
())
{
str_
+=
"cinn_uint32_t()"
;
}
else
if
(
type
==
cinn_uint64_t
())
{
str_
+=
"cinn_uint64_t()"
;
}
else
if
(
type
==
cinn_bfloat16_t
())
{
str_
+=
"cinn_bfloat16_t()"
;
}
else
if
(
type
==
cinn_float16_t
())
{
str_
+=
"cinn_float16_t()"
;
}
else
if
(
type
==
cinn_float32_t
())
{
str_
+=
"cinn_float32_t()"
;
}
else
if
(
type
==
cinn_float64_t
())
{
str_
+=
"cinn_float64_t()"
;
}
else
{
LOG
(
FATAL
)
<<
"Unknown type is not supported to print"
;
}
}
void
CodeGenC
::
PrintStackVecType
(
Type
type
,
int
lanes
)
{
str_
+=
"StackedVec<"
;
str_
+=
GetTypeRepr
(
type
);
str_
+=
","
;
str_
+=
std
::
to_string
(
lanes
);
str_
+=
">"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
PrimitiveNode
*
op
)
{
CINN_NOT_IMPLEMENTED
}
void
CodeGenC
::
Visit
(
const
ir
::
_BufferRange_
*
op
)
{
CINN_NOT_IMPLEMENTED
}
void
CodeGenC
::
Visit
(
const
ir
::
ScheduleBlock
*
op
)
{
CINN_NOT_IMPLEMENTED
}
void
CodeGenC
::
Visit
(
const
ir
::
ScheduleBlockRealize
*
op
)
{
CINN_NOT_IMPLEMENTED
}
void
CodeGenC
::
Visit
(
const
ir
::
IntrinsicOp
*
op
)
{
switch
(
op
->
getKind
())
{
#define __(x) \
case ir::IntrinsicKind::k##x: \
Visit(llvm::dyn_cast<ir::intrinsics::x>(op)); \
break;
INTRINSIC_KIND_FOR_EACH
(
__
)
#undef __
}
}
void
CodeGenC
::
Visit
(
const
ir
::
intrinsics
::
BufferGetDataHandle
*
op
)
{
str_
+=
op
->
buffer
.
as_buffer
()
->
name
;
str_
+=
"->"
;
str_
+=
"memory"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
intrinsics
::
BufferGetDataConstHandle
*
op
)
{
str_
+=
op
->
buffer
.
as_buffer
()
->
name
;
str_
+=
"->"
;
str_
+=
"memory"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
intrinsics
::
PodValueToX
*
op
)
{
auto
to_type
=
op
->
GetOutputType
(
0
);
if
(
to_type
==
type_of
<
float
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_float
;
}
else
if
(
to_type
==
type_of
<
double
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_double
;
}
else
if
(
to_type
==
type_of
<
float16
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_float16
;
}
else
if
(
to_type
==
type_of
<
bool
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_bool
;
}
else
if
(
to_type
==
type_of
<
int8_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_int8
;
}
else
if
(
to_type
==
type_of
<
int16_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_int16
;
}
else
if
(
to_type
==
type_of
<
int32_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_int32
;
}
else
if
(
to_type
==
type_of
<
int64_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_int64
;
}
else
if
(
to_type
==
type_of
<
uint8_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_uint8
;
}
else
if
(
to_type
==
type_of
<
uint16_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_uint16
;
}
else
if
(
to_type
==
type_of
<
uint32_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_uint32
;
}
else
if
(
to_type
==
type_of
<
uint64_t
>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_uint64
;
}
else
if
(
to_type
==
type_of
<
void
*>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_void_p
;
}
else
if
(
to_type
==
type_of
<
cinn_buffer_t
*>
())
{
str_
+=
runtime
::
intrinsic
::
pod_value_to_buffer_p
;
}
else
{
LOG
(
FATAL
)
<<
"Not supported type: "
<<
to_type
;
}
str_
+=
"("
;
IrPrinter
::
Visit
(
op
->
pod_value_ptr
);
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
intrinsics
::
BufferCreate
*
op
)
{
const
ir
::
_Buffer_
*
buffer_arg
=
op
->
buffer
.
as_buffer
();
CHECK
(
buffer_arg
);
str_
+=
runtime
::
intrinsic
::
buffer_create
;
str_
+=
"("
;
PrintCastExpr
(
"cinn_device_kind_t"
,
Expr
(
buffer_arg
->
target
.
runtime_arch
()));
str_
+=
"/*target*/, "
;
PrintRuntimeType
(
runtime
::
ToRuntimeType
(
buffer_arg
->
dtype
.
ElementOf
()));
str_
+=
", "
;
PrintShape
(
buffer_arg
->
shape
);
if
(
buffer_arg
->
data_alignment
>
0
)
{
str_
+=
", "
;
str_
+=
std
::
to_string
(
buffer_arg
->
data_alignment
);
str_
+=
"/*align*/"
;
}
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
intrinsics
::
GetAddr
*
op
)
{
if
(
op
->
data
.
as_buffer
())
{
str_
+=
"&"
;
str_
+=
op
->
data
.
as_buffer
()
->
name
;
}
else
if
(
op
->
data
.
as_var
())
{
str_
+=
"&"
;
str_
+=
op
->
data
.
as_var
()
->
name
;
}
else
{
str_
+=
"&("
;
IrPrinter
::
Visit
(
op
->
data
);
str_
+=
")"
;
}
}
void
CodeGenC
::
Visit
(
const
ir
::
intrinsics
::
ArgsConstruct
*
op
)
{
str_
+=
runtime
::
intrinsic
::
args_construct_repr
;
str_
+=
"("
;
str_
+=
op
->
var
->
name
;
str_
+=
", "
;
str_
+=
std
::
to_string
(
op
->
args
.
size
());
str_
+=
", "
;
for
(
int
i
=
0
;
i
<
op
->
args
.
size
()
-
1
;
i
++
)
{
IrPrinter
::
Visit
(
op
->
args
[
i
]);
str_
+=
", "
;
}
if
(
!
op
->
args
.
empty
())
{
IrPrinter
::
Visit
(
op
->
args
.
back
());
}
str_
+=
")"
;
}
void
CodeGenC
::
Visit
(
const
ir
::
intrinsics
::
BuiltinIntrin
*
op
)
{
str_
+=
op
->
name
;
str_
+=
"("
;
if
(
!
op
->
args
.
empty
())
{
for
(
int
i
=
0
;
i
<
op
->
args
.
size
()
-
1
;
i
++
)
{
IrPrinter
::
Visit
(
op
->
args
[
i
]);
str_
+=
", "
;
}
IrPrinter
::
Visit
(
op
->
args
.
back
());
}
str_
+=
")"
;
}
std
::
string
ReadWholeFile
(
const
std
::
string
&
path
)
{
CHECK
(
!
path
.
empty
());
std
::
ifstream
file
(
path
);
CHECK
(
file
.
is_open
())
<<
"Failed to open file: "
<<
path
;
std
::
stringstream
ss
;
ss
<<
file
.
rdbuf
();
return
ss
.
str
();
}
void
CodeGenC
::
PrintBuiltinCodes
()
{
CHECK
(
!
FLAGS_cinn_x86_builtin_code_root
.
empty
())
<<
"The flag cinn_x86_builtin_code_root should be set first"
;
const
std
::
string
x86_code_file
=
"_x86_builtin_source.cc"
;
auto
source
=
ReadWholeFile
(
FLAGS_cinn_x86_builtin_code_root
+
"/"
+
x86_code_file
);
str_
+=
source
;
str_
+=
"
\n
"
;
}
namespace
detail
{
Expr
StridedRampBase
(
Expr
e
,
int
stride
)
{
auto
*
ramp_n
=
e
.
As
<
ir
::
Ramp
>
();
if
(
ramp_n
)
{
auto
*
iv
=
ramp_n
->
stride
.
As
<
ir
::
IntImm
>
();
if
(
iv
&&
iv
->
value
==
stride
)
return
ramp_n
->
base
;
}
return
Expr
();
}
}
// namespace detail
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_c.h
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gflags/gflags.h>
#include <string>
#include <vector>
#include "paddle/cinn/common/common.h"
#include "paddle/cinn/ir/intrinsic_ops.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/packed_func.h"
#include "paddle/cinn/runtime/cinn_runtime.h"
namespace
cinn
{
namespace
ir
{
class
Module
;
}
// namespace ir
namespace
backends
{
//! keyword of __restrict__.
extern
const
char
*
kCKeywordRestrict
;
class
CodeGenC
:
public
ir
::
IrPrinter
{
public:
enum
class
OutputKind
{
CHeader
,
//! output the C header file.
CImpl
,
//! output the C implementation file.
};
explicit
CodeGenC
(
Target
target
);
void
Compile
(
const
ir
::
Module
&
module
,
const
Outputs
&
outputs
);
virtual
std
::
string
Compile
(
const
ir
::
Module
&
module
,
OutputKind
output_kind
);
//! Disable inline the builtin codes(too large) for simpler string comparison.
void
SetInlineBuiltinCodes
(
bool
x
=
true
)
{
inline_builtin_codes_
=
x
;
}
protected:
void
Compile
(
const
ir
::
LoweredFunc
&
function
);
void
GenerateHeaderFile
(
const
ir
::
Module
&
module
);
std
::
string
GetTypeName
(
Type
type
);
std
::
string
GetTypeRepr
(
Type
type
);
//! type cast, print like "int(x)"
// @{
void
PrintCastExpr
(
const
Type
&
type
,
Expr
e
);
void
PrintCastExpr
(
const
std
::
string
&
type
,
Expr
e
);
// @}
void
PrintFunctionDeclaration
(
const
ir
::
_LoweredFunc_
*
op
)
{
str_
+=
"void "
;
str_
+=
op
->
name
;
str_
+=
"("
;
str_
+=
"void* _args, int32_t num_args"
;
str_
+=
")"
;
}
void
PrintShape
(
const
std
::
vector
<
Expr
>&
shape
,
char
leftb
=
'{'
,
char
rightb
=
'}'
);
virtual
void
PrintIncludes
();
void
PrintBuiltinCodes
();
void
PrintFileGuardOpen
(
const
std
::
string
&
module_name
);
void
PrintFileGuardClose
(
const
std
::
string
&
module_name
);
//! Create the buffers in global scope(just creation without allocating them).
void
PrintBufferCreation
(
const
std
::
vector
<
ir
::
Buffer
>&
buffers
);
void
PrintBufferDestroy
(
const
std
::
vector
<
ir
::
Buffer
>&
buffers
);
void
PrintRuntimeType
(
const
cinn_type_t
&
type
);
//! Print different kinds of Calls.
// @{
void
PrintCallArgs
(
const
ir
::
Call
*
call
);
void
PrintCall_buffer_malloc
(
const
ir
::
Call
*
op
);
void
PrintCall_cinn_pod_value_to_
(
const
ir
::
Call
*
op
);
void
PrintCall_get_address
(
const
ir
::
Call
*
op
);
void
PrintCall_pod_values_to_array
(
const
ir
::
Call
*
op
);
// @}
#define __DEFINE_VISIT(op__) void Visit(const ir::op__* op) override;
NODETY_FORALL
(
__DEFINE_VISIT
)
#undef __DEFINE_VISIT
#define __DEFINE_VISIT(op__) \
void Visit(const ir::intrinsics::op__* op) override;
INTRINSIC_KIND_FOR_EACH
(
__DEFINE_VISIT
)
#undef __DEFINE_VISIT
void
PrintFuncArg
(
const
ir
::
Argument
&
arg
);
void
PrintStackVecType
(
Type
type
,
int
lanes
);
friend
class
ExternFunctionEmitter
;
protected:
Target
target_
;
std
::
stringstream
ss_
;
bool
inline_builtin_codes_
{
true
};
};
namespace
detail
{
Expr
StridedRampBase
(
Expr
e
,
int
stride
);
}
// namespace detail
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_c_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c.h"
#include <gtest/gtest.h>
#include <sstream>
#include <tuple>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/lang/builtin.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/runtime/cpu/use_extern_funcs.h"
namespace
cinn
{
namespace
backends
{
using
ir
::
Module
;
using
lang
::
Compute
;
using
lang
::
Lower
;
using
lang
::
Placeholder
;
using
utils
::
StringFormat
;
using
utils
::
Trim
;
std
::
tuple
<
ir
::
Tensor
,
ir
::
Tensor
,
ir
::
Tensor
,
lang
::
Buffer
>
CreateTensor1
()
{
Expr
M
(
100
);
Expr
N
(
20
);
Placeholder
<
float
>
A
(
"A"
,
{
M
,
N
});
Placeholder
<
float
>
B
(
"B"
,
{
M
,
N
});
lang
::
Buffer
C_buf
(
Float
(
32
));
auto
C
=
Compute
(
{
M
,
N
},
[
&
](
Var
i
,
Var
j
)
{
return
A
(
i
,
j
)
+
B
(
i
,
j
);
},
"C"
);
C
->
Bind
(
C_buf
);
return
std
::
make_tuple
(
A
,
B
,
C
,
C_buf
);
}
TEST
(
CodeGenC
,
module
)
{
ir
::
Tensor
A
,
B
,
C
;
lang
::
Buffer
C_buf
(
Float
(
32
));
std
::
tie
(
A
,
B
,
C
,
C_buf
)
=
CreateTensor1
();
LOG
(
INFO
)
<<
"C.body: "
<<
C
->
get_compute_op
()
->
body
.
front
();
Target
target
;
target
.
arch
=
Target
::
Arch
::
X86
;
target
.
bits
=
Target
::
Bit
::
k32
;
target
.
os
=
Target
::
OS
::
Linux
;
Module
::
Builder
builder
(
"module1"
,
target
);
auto
stages
=
CreateStages
({
A
,
B
,
C
});
auto
func
=
Lower
(
"add1"
,
stages
,
{
A
,
B
,
C
});
builder
.
AddFunction
(
func
);
{
CodeGenC
codegen
(
target
);
codegen
.
SetInlineBuiltinCodes
(
false
);
auto
out
=
codegen
.
Compile
(
builder
.
Build
(),
CodeGenC
::
OutputKind
::
CImpl
);
std
::
cout
<<
"codegen C:"
<<
std
::
endl
<<
out
<<
std
::
endl
;
std
::
string
target_str
=
R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void add1(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
for (int32_t i = 0; i < 100; i += 1) {
for (int32_t j = 0; j < 20; j += 1) {
C[((20 * i) + j)] = (A[((20 * i) + j)] + B[((20 * i) + j)]);
};
};
cinn_buffer_free((void*)(0), _C);
}
)ROC"
;
EXPECT_EQ
(
utils
::
Trim
(
target_str
),
utils
::
Trim
(
out
));
}
{
CodeGenC
compiler
(
target
);
auto
out
=
compiler
.
Compile
(
builder
.
Build
(),
CodeGenC
::
OutputKind
::
CHeader
);
std
::
cout
<<
"header:
\n
"
<<
out
<<
std
::
endl
;
auto
target_str
=
R"ROC(
#ifndef _MODULE1_CINN_H_
#define _MODULE1_CINN_H_
#include <cinn_runtime.h>
#include <stdio.h>
void add1(void* _args, int32_t num_args);
#endif // _MODULE1_CINN_H_
)ROC"
;
EXPECT_EQ
(
utils
::
Trim
(
out
),
utils
::
Trim
(
target_str
));
}
{
CodeGenC
compiler
(
target
);
compiler
.
SetInlineBuiltinCodes
(
false
);
Outputs
outputs
;
outputs
=
outputs
.
c_header
(
"./generated_module1.h"
)
.
c_source
(
"./_generated_module1.cc"
);
compiler
.
Compile
(
builder
.
Build
(),
outputs
);
}
}
TEST
(
CodeGenC
,
matmul
)
{
using
namespace
ir
;
// NOLINT
Context
::
Global
().
ResetNameId
();
Placeholder
<
float
>
A
(
"A"
,
{
Expr
(
100
),
Expr
(
20
)});
Placeholder
<
float
>
B
(
"B"
,
{
Expr
(
20
),
Expr
(
50
)});
Target
target
{};
Module
::
Builder
builder
(
"module1"
,
target
);
// C = A * B
Var
k
(
20
,
"k0"
);
Tensor
C
=
Compute
(
{
Expr
(
100
),
Expr
(
50
)},
[
&
](
Var
i
,
Var
j
)
{
return
lang
::
ReduceSum
(
A
(
i
,
k
)
*
B
(
k
,
j
),
{
k
});
},
"C"
);
auto
stages
=
CreateStages
({
A
,
B
,
C
});
// Code gen
auto
func
=
Lower
(
"matmul"
,
stages
,
{
A
,
B
,
C
});
builder
.
AddFunction
(
func
);
builder
.
AddBuffer
(
C
->
buffer
);
{
// main
std
::
vector
<
lang
::
ReturnType
>
returns
(
{
lang
::
ReturnType
{
Float
(
32
),
C
->
shape
,
C
->
name
}});
auto
tensors
=
lang
::
CallLowered
(
"matmul"
,
{
A
,
B
},
returns
);
auto
C
=
tensors
[
0
];
C
->
WithBuffer
();
LOG
(
INFO
)
<<
"C.body: "
<<
C
->
body
();
auto
stages
=
CreateStages
({
C
});
auto
f
=
Lower
(
"main"
,
stages
,
{
A
,
B
,
C
},
{});
std
::
cout
<<
"f
\n
"
<<
Expr
(
f
)
<<
std
::
endl
;
builder
.
AddFunction
(
f
);
}
CodeGenC
codegen
(
target
);
codegen
.
SetInlineBuiltinCodes
(
false
);
auto
out
=
codegen
.
Compile
(
builder
.
Build
(),
CodeGenC
::
OutputKind
::
CImpl
);
std
::
cout
<<
"codegen C:"
<<
std
::
endl
<<
out
<<
std
::
endl
;
auto
tgt
=
R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void matmul(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
float* C__reduce_init = ((float*)(_C->memory));
for (int32_t i = 0; i < 100; i += 1) {
for (int32_t j = 0; j < 50; j += 1) {
C__reduce_init[((50 * i) + j)] = 0.00000000f;
for (int32_t k0 = 0; k0 < 20; k0 += 1) {
C[((50 * i) + j)] = (C[((50 * i) + j)] + (A[((20 * i) + k0)] * B[((50 * k0) + j)]));
};
};
};
cinn_buffer_free((void*)(0), _C);
}
void main(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
cinn_pod_value_t _pod_val_;
buffer_p_to_cinn_pod_value(_A, &_pod_val_);
cinn_pod_value_t _pod_val__0;
buffer_p_to_cinn_pod_value(_B, &_pod_val__0);
cinn_pod_value_t _pod_val__1;
buffer_p_to_cinn_pod_value(_C, &_pod_val__1);
cinn_pod_value_t _pod_arr[3];
cinn_args_construct(_pod_arr, 3, &_pod_val_, &_pod_val__0, &_pod_val__1);
matmul(_pod_arr, 3);
cinn_buffer_free((void*)(0), _C);
}
)ROC"
;
ASSERT_EQ
(
Trim
(
tgt
),
Trim
(
out
));
}
// This matches output of competitor.
TEST
(
CodeGenC
,
matmul_tile
)
{
using
namespace
ir
;
// NOLINT
Expr
M
(
100
);
Expr
K
(
200
);
Expr
N
(
500
);
Expr
bn
(
32
);
Placeholder
<
float
>
A
(
"A"
,
{
M
,
K
});
Placeholder
<
float
>
B
(
"B"
,
{
K
,
N
});
// C = A * B
Var
k
(
K
.
as_int32
(),
"k0"
);
Tensor
C_init
=
Compute
(
{
M
,
N
},
[
&
](
Var
i
,
Var
j
)
{
return
Expr
(
0.
f
);
},
"C_init"
);
Tensor
C
=
Compute
(
{
M
,
N
},
[
&
](
Var
i
,
Var
j
)
{
return
lang
::
ReduceSum
(
A
(
i
,
k
)
*
B
(
k
,
j
),
{
k
});
},
"C"
);
auto
stages
=
CreateStages
({
C
,
C_init
});
stages
[
C
]
->
ShareBufferWith
(
stages
[
C_init
]);
{
auto
_i_outer_i_inner_j_outer_j_inner_
=
stages
[
C_init
]
->
Tile
(
0
,
1
,
bn
.
as_int32
(),
bn
.
as_int32
());
// NOLINT
auto
&
i_outer
=
std
::
get
<
0
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
i_inner
=
std
::
get
<
1
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
j_outer
=
std
::
get
<
2
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
j_inner
=
std
::
get
<
3
>
(
_i_outer_i_inner_j_outer_j_inner_
);
stages
[
C_init
]
->
Reorder
({
i_outer
,
j_outer
,
i_inner
,
j_inner
});
}
{
auto
_i_outer_i_inner_j_outer_j_inner_
=
stages
[
C
]
->
Tile
(
0
,
1
,
bn
.
as_int32
(),
bn
.
as_int32
());
// NOLINT
auto
&
i_outer
=
std
::
get
<
0
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
i_inner
=
std
::
get
<
1
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
j_outer
=
std
::
get
<
2
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
j_inner
=
std
::
get
<
3
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
_k_outer_k_inner_
=
stages
[
C
]
->
Split
(
poly
::
Iterator
(
"k0"
),
4
);
// NOLINT
auto
&
k_outer
=
std
::
get
<
0
>
(
_k_outer_k_inner_
);
auto
&
k_inner
=
std
::
get
<
1
>
(
_k_outer_k_inner_
);
stages
[
C
]
->
Reorder
({
i_outer
,
j_outer
,
i_inner
,
j_inner
,
k_outer
,
k_inner
});
}
stages
[
C_init
]
->
ComputeAtSchedule
(
stages
[
C
],
3
,
poly
::
Stage
::
kComputeAtBefore
);
// Code gen
auto
func
=
Lower
(
"matmul"
,
stages
,
{
A
,
B
,
C
});
Target
target
=
common
::
DefaultHostTarget
();
Module
::
Builder
builder
(
"module1"
,
target
);
builder
.
AddFunction
(
func
);
builder
.
AddBuffer
(
C_init
->
buffer
);
CodeGenC
codegen
(
target
);
codegen
.
SetInlineBuiltinCodes
(
false
);
auto
out
=
codegen
.
Compile
(
builder
.
Build
(),
CodeGenC
::
OutputKind
::
CImpl
);
std
::
cout
<<
"codegen C:"
<<
std
::
endl
<<
out
<<
std
::
endl
;
auto
target_out
=
R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void matmul(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
float* C__reduce_init = ((float*)(_C->memory));
float* C_init = ((float*)(_C->memory));
for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) {
for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) {
for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) {
for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) {
C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f;
C_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f;
for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) {
for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) {
C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], B[((32 * j_outer) + ((500 * k0_inner) + ((2000 * k0_outer) + j_inner)))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]);
};
};
};
};
};
};
cinn_buffer_free((void*)(0), _C);
}
)ROC"
;
ASSERT_EQ
(
Trim
(
target_out
),
Trim
(
out
));
}
TEST
(
CodeGenC
,
matmul_packed
)
{
Expr
M
(
100
);
Expr
K
(
200
);
Expr
N
(
500
);
Expr
bn
(
32
);
Placeholder
<
float
>
A
(
"A"
,
{
M
,
K
});
Placeholder
<
float
>
B
(
"B"
,
{
K
,
N
});
// TODO(Superjomn) Make sure the domain works.
Var
k
(
K
.
as_int32
(),
"k0"
);
auto
packedB
=
Compute
(
{
N
/
bn
,
K
,
bn
},
[
&
](
Expr
x
,
Expr
y
,
Expr
z
)
{
return
B
(
y
,
x
*
bn
+
z
);
},
"PackedB"
);
auto
C
=
Compute
(
{
M
,
N
},
[
&
](
Expr
i
,
Expr
j
)
{
return
ReduceSum
(
A
(
i
,
k
)
*
packedB
(
j
/
bn
,
k
,
j
%
bn
),
{
k
});
},
"C"
);
auto
stages
=
CreateStages
({
packedB
,
C
});
{
auto
_i_outer_i_inner_j_outer_j_inner_
=
stages
[
C
]
->
Tile
(
0
,
1
,
bn
.
as_int32
(),
bn
.
as_int32
());
auto
&
i_outer
=
std
::
get
<
0
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
i_inner
=
std
::
get
<
1
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
j_outer
=
std
::
get
<
2
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
&
j_inner
=
std
::
get
<
3
>
(
_i_outer_i_inner_j_outer_j_inner_
);
auto
_k_outer_k_inner_
=
stages
[
C
]
->
Split
(
poly
::
Iterator
(
"k0"
),
4
);
auto
&
k_outer
=
std
::
get
<
0
>
(
_k_outer_k_inner_
);
auto
&
k_inner
=
std
::
get
<
1
>
(
_k_outer_k_inner_
);
stages
[
C
]
->
Reorder
({
i_outer
,
j_outer
,
i_inner
,
j_inner
,
k_outer
,
k_inner
});
}
// Code gen
auto
func
=
Lower
(
"matmul_with_packing"
,
stages
,
{
A
,
B
,
packedB
,
C
});
Target
target
=
common
::
DefaultHostTarget
();
Module
::
Builder
builder
(
"module1"
,
target
);
builder
.
AddFunction
(
func
);
builder
.
AddBuffer
(
C
->
buffer
);
builder
.
AddBuffer
(
packedB
->
buffer
);
CodeGenC
codegen
(
target
);
codegen
.
SetInlineBuiltinCodes
(
false
);
auto
out
=
codegen
.
Compile
(
builder
.
Build
(),
CodeGenC
::
OutputKind
::
CImpl
);
std
::
cout
<<
"codegen C:"
<<
std
::
endl
<<
out
<<
std
::
endl
;
auto
target_out
=
R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void matmul_with_packing(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _PackedB = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[3]));
cinn_buffer_malloc((void*)(0), _PackedB);
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
float* C__reduce_init = ((float*)(_C->memory));
float* PackedB = ((float*)(_PackedB->memory));
for (int32_t i = 0; i < 15; i += 1) {
for (int32_t j = 0; j < 200; j += 1) {
for (int32_t k = 0; k < 32; k += 1) {
PackedB[((6400 * i) + ((32 * j) + k))] = B[((32 * i) + ((500 * j) + k))];
};
};
};
for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) {
for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) {
for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) {
for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) {
C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0;
for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) {
for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) {
C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], PackedB[((6400 * (j_inner / 32)) + ((j_inner & 31) + ((6400 * j_outer) + ((32 * k0_inner) + (128 * k0_outer)))))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]);
};
};
};
};
};
};
cinn_buffer_free((void*)(0), _PackedB);
cinn_buffer_free((void*)(0), _C);
}
)ROC"
;
// ToDo @haoze @wangyue Check Codegen
// ASSERT_EQ(utils::Trim(target_out), utils::Trim(out));
}
TEST
(
CodeGenC
,
call_extern
)
{
Expr
M
(
100
);
Placeholder
<
float
>
x
(
"x"
,
{
M
});
ir
::
Tensor
y
=
Compute
(
{
M
},
[
=
](
Var
i
)
->
Expr
{
return
lang
::
CallExtern
(
"tanh"
,
{
x
(
i
)});
},
"y"
);
auto
stages
=
CreateStages
({
y
});
auto
yexpr
=
Lower
(
"yy"
,
stages
,
{
y
});
Module
::
Builder
builder
(
"module0"
,
common
::
DefaultHostTarget
());
builder
.
AddFunction
(
yexpr
);
CodeGenC
codegen
(
common
::
DefaultHostTarget
());
codegen
.
SetInlineBuiltinCodes
(
false
);
auto
out
=
codegen
.
Compile
(
builder
.
Build
(),
CodeGenC
::
OutputKind
::
CImpl
);
std
::
cout
<<
"codegen C:"
<<
std
::
endl
<<
out
<<
std
::
endl
;
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_c_x86.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c_x86.h"
namespace
cinn
{
namespace
backends
{
void
CodeGenCX86
::
Visit
(
const
ir
::
Add
*
op
)
{
VisitBinaryOp
(
op
,
op
->
a
(),
op
->
b
(),
"add"
);
}
void
CodeGenCX86
::
Visit
(
const
ir
::
Sub
*
op
)
{
VisitBinaryOp
(
op
,
op
->
a
(),
op
->
b
(),
"sub"
);
}
void
CodeGenCX86
::
Visit
(
const
ir
::
Mul
*
op
)
{
VisitBinaryOp
(
op
,
op
->
a
(),
op
->
b
(),
"mul"
);
}
void
CodeGenCX86
::
Visit
(
const
ir
::
Div
*
op
)
{
VisitBinaryOp
(
op
,
op
->
a
(),
op
->
b
(),
"div"
);
}
void
CodeGenCX86
::
Visit
(
const
ir
::
Load
*
op
)
{
Expr
dense_strided_ramp
=
detail
::
StridedRampBase
(
op
->
index
(),
1
);
if
(
dense_strided_ramp
.
defined
())
{
// Loading a continuous Ramp address.
CHECK
(
op
->
type
().
is_vector
());
int
bits
=
op
->
type
().
bits
()
*
op
->
type
().
lanes
();
if
(
SupportsAVX512
()
&&
bits
==
512
)
{
str_
+=
"cinn_avx512_load("
;
PrintAbsAddr
(
op
);
str_
+=
")"
;
}
else
if
(
SupportsAVX256
()
&&
bits
==
256
)
{
str_
+=
"cinn_avx256_load("
;
PrintAbsAddr
(
op
);
str_
+=
")"
;
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
void
CodeGenCX86
::
Visit
(
const
ir
::
Broadcast
*
op
)
{
CHECK_GT
(
op
->
type
().
lanes
(),
1
);
int
bits
=
op
->
type
().
bits
()
*
op
->
type
().
lanes
();
if
(
SupportsAVX512
()
&&
bits
==
512
)
{
str_
+=
"cinn_avx512_set1("
;
PrintCastExpr
(
op
->
value
.
type
().
ElementOf
(),
op
->
value
);
str_
+=
")"
;
}
else
if
(
SupportsAVX256
()
&&
bits
==
256
)
{
str_
+=
"cinn_avx256_set1("
;
PrintCastExpr
(
op
->
value
.
type
().
ElementOf
(),
op
->
value
);
str_
+=
")"
;
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
void
CodeGenCX86
::
Visit
(
const
ir
::
Store
*
op
)
{
if
(
op
->
type
().
lanes
()
==
1
)
{
CodeGenC
::
Visit
(
op
);
return
;
}
int
bits
=
op
->
type
().
bits
()
*
op
->
type
().
lanes
();
if
(
SupportsAVX512
()
&&
bits
==
512
)
{
str_
+=
"cinn_avx512_store("
;
PrintAbsAddr
(
op
);
str_
+=
", "
;
IrPrinter
::
Visit
(
op
->
value
);
str_
+=
")"
;
}
else
if
(
SupportsAVX256
()
&&
bits
==
256
)
{
str_
+=
"cinn_avx256_store("
;
PrintAbsAddr
(
op
);
str_
+=
", "
;
IrPrinter
::
Visit
(
op
->
value
);
str_
+=
")"
;
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
void
CodeGenCX86
::
PrintVecInputArgument
(
const
Expr
*
op
)
{
int
bits
=
op
->
type
().
bits
()
*
op
->
type
().
lanes
();
auto
*
broadcast_n
=
op
->
As
<
ir
::
Broadcast
>
();
if
(
op
->
type
().
lanes
()
==
1
||
broadcast_n
)
{
Expr
value
=
op
->
type
().
lanes
()
==
1
?
*
op
:
broadcast_n
->
value
;
if
(
SupportsAVX512
())
{
str_
+=
"cinn_avx512_set1("
;
IrPrinter
::
Visit
(
value
);
str_
+=
")"
;
}
else
if
(
SupportsAVX256
())
{
str_
+=
"cinn_avx256_set1("
;
IrPrinter
::
Visit
(
value
);
str_
+=
")"
;
}
else
{
CINN_NOT_IMPLEMENTED
}
}
else
{
IrPrinter
::
Visit
(
*
op
);
}
}
void
CodeGenCX86
::
Visit
(
const
ir
::
intrinsics
::
BuiltinIntrin
*
op
)
{
if
(
op
->
type
().
lanes
()
==
1
)
{
CodeGenC
::
Visit
(
op
);
return
;
}
int
bits
=
op
->
type
().
bits
()
*
op
->
type
().
lanes
();
if
(
SupportsAVX512
()
&&
bits
==
512
)
{
str_
+=
"cinn_avx512_"
;
str_
+=
op
->
name
;
str_
+=
"("
;
if
(
!
op
->
args
.
empty
())
{
for
(
int
i
=
0
;
i
<
op
->
args
.
size
()
-
1
;
i
++
)
{
PrintVecInputArgument
(
&
op
->
args
[
i
]);
str_
+=
", "
;
}
IrPrinter
::
Visit
(
op
->
args
.
back
());
}
str_
+=
")"
;
}
else
if
(
SupportsAVX256
()
&&
bits
==
256
)
{
str_
+=
"cinn_avx256_"
;
str_
+=
op
->
name
;
str_
+=
"("
;
if
(
!
op
->
args
.
empty
())
{
for
(
int
i
=
0
;
i
<
op
->
args
.
size
()
-
1
;
i
++
)
{
PrintVecInputArgument
(
&
op
->
args
[
i
]);
str_
+=
", "
;
}
PrintVecInputArgument
(
&
op
->
args
.
back
());
}
str_
+=
")"
;
}
else
if
(
bits
==
128
)
{
str_
+=
"cinn_avx128_"
;
str_
+=
op
->
name
;
str_
+=
"("
;
if
(
!
op
->
args
.
empty
())
{
for
(
int
i
=
0
;
i
<
op
->
args
.
size
()
-
1
;
i
++
)
{
PrintVecInputArgument
(
&
op
->
args
[
i
]);
str_
+=
", "
;
}
PrintVecInputArgument
(
&
op
->
args
.
back
());
}
str_
+=
")"
;
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_c_x86.h
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/cinn/backends/codegen_c.h"
#include "paddle/cinn/ir/intrinsic_ops.h"
namespace
cinn
{
namespace
backends
{
/**
* C code generation with X86 instruction or math library support.
*/
class
CodeGenCX86
:
public
CodeGenC
{
public:
//! The X86 CPU supports some following features. We use SSE or AVX to
//! accelerate the basic operations if forloop is vectorized.
enum
class
Feature
:
int
{
None
=
0
,
SSE
=
1
,
//! support SSE instruction set.
AVX256
=
1
<<
1
,
// ! support AVX256 instruction set.
AVX512
=
1
<<
2
,
// ! support AVX512 instruction set.
BLAS
=
1
<<
3
,
// ! support BLAS library.
};
Feature
feature
{
Feature
::
None
};
/**
* constructor.
* @param target The device.
* @param features Features it supported.
*/
CodeGenCX86
(
Target
target
,
Feature
feature
)
:
CodeGenC
(
target
),
feature
(
feature
)
{}
protected:
void
Visit
(
const
ir
::
Add
*
op
)
override
;
void
Visit
(
const
ir
::
Sub
*
op
)
override
;
void
Visit
(
const
ir
::
Mul
*
op
)
override
;
void
Visit
(
const
ir
::
Div
*
op
)
override
;
void
Visit
(
const
ir
::
Mod
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
EQ
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
NE
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
LT
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
LE
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
GT
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
GE
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
And
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
Or
*
op
)
override
{
CodeGenC
::
Visit
(
op
);
}
void
Visit
(
const
ir
::
Load
*
op
)
override
;
void
Visit
(
const
ir
::
Store
*
op
)
override
;
void
Visit
(
const
ir
::
Broadcast
*
op
)
override
;
void
Visit
(
const
ir
::
intrinsics
::
BuiltinIntrin
*
op
);
//! Check the features.
// @{
bool
SupportsSSE
()
{
return
static_cast
<
int
>
(
feature
)
&
static_cast
<
int
>
(
Feature
::
SSE
);
}
bool
SupportsAVX256
()
{
return
static_cast
<
int
>
(
feature
)
&
static_cast
<
int
>
(
Feature
::
AVX256
);
}
bool
SupportsAVX512
()
{
return
static_cast
<
int
>
(
feature
)
&
static_cast
<
int
>
(
Feature
::
AVX512
);
}
bool
SupportsBLAS
()
{
return
static_cast
<
int
>
(
feature
)
&
static_cast
<
int
>
(
Feature
::
BLAS
);
}
// @}
//! Print (and prepare) a argument in vectorize type, for example:
// 3. -> set1(3.)
// a[i:j] -> load_ps(a+i)
void
PrintVecInputArgument
(
const
Expr
*
op
);
//! The output argument, such as the destination for Load.
void
PrintVecOutputArgument
(
const
Expr
*
op
);
template
<
typename
Op
>
void
PrintAbsAddr
(
const
Op
*
op
)
{
str_
+=
op
->
tensor
.
template
As
<
ir
::
_Tensor_
>()
->
name
;
str_
+=
" + "
;
auto
index
=
op
->
index
();
auto
*
ramp_n
=
index
.
template
As
<
ir
::
Ramp
>();
if
(
ramp_n
)
{
CHECK
(
!
ramp_n
->
base
.
template
As
<
ir
::
Ramp
>())
<<
"base of a Ramp node should not be Ramp type"
;
IrPrinter
::
Visit
(
ramp_n
->
base
);
}
else
{
IrPrinter
::
Visit
(
op
->
index
());
}
}
template
<
typename
Op
>
void
VisitBinaryOp
(
const
Op
*
op
,
Expr
a
,
Expr
b
,
const
std
::
string
&
op_repr
);
};
template
<
typename
Op
>
void
CodeGenCX86
::
VisitBinaryOp
(
const
Op
*
op
,
Expr
a
,
Expr
b
,
const
std
::
string
&
op_repr
)
{
CHECK_EQ
(
a
.
type
(),
b
.
type
())
<<
" a is : "
<<
a
<<
", and b is : "
<<
b
<<
". op_repr is : "
<<
op_repr
;
// scalar.
if
(
a
.
type
().
lanes
()
==
1
)
{
CodeGenC
::
Visit
(
op
);
return
;
}
// TODO(Superjomn) Consider support BLAS.
int
bits
=
a
.
type
().
bits
()
*
a
.
type
().
lanes
();
if
(
SupportsAVX512
()
&&
bits
==
512
)
{
str_
+=
"cinn_avx512_"
;
str_
+=
op_repr
;
str_
+=
"("
;
PrintVecInputArgument
(
&
a
);
str_
+=
", "
;
PrintVecInputArgument
(
&
b
);
str_
+=
")"
;
}
else
if
(
SupportsAVX256
()
&&
bits
==
256
)
{
str_
+=
"cinn_avx256_"
;
str_
+=
op_repr
;
str_
+=
"("
;
PrintVecInputArgument
(
&
a
);
str_
+=
", "
;
PrintVecInputArgument
(
&
b
);
str_
+=
")"
;
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_c_x86_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c_x86.h"
#include <gtest/gtest.h>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/lang/builtin.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/optim/transform_polyfor_to_for.h"
#include "paddle/cinn/optim/vectorize_loops.h"
namespace
cinn
{
namespace
backends
{
TEST
(
CodeGenCX86
,
basic
)
{
// create two forloops, check only one forloop is marked Vectorize.
Context
::
info_rgt
().
Clear
();
using
namespace
ir
;
// NOLINT
const
int
M
=
100
;
const
int
K
=
200
;
const
int
N
=
500
;
const
int
bn
=
32
;
Target
target
;
target
.
arch
=
Target
::
Arch
::
X86
;
target
.
bits
=
Target
::
Bit
::
k32
;
target
.
os
=
Target
::
OS
::
Linux
;
Placeholder
<
float
>
A
(
"A"
,
{
M
,
N
});
Placeholder
<
float
>
B
(
"B"
,
{
M
,
N
});
// C = A * B
Tensor
C
=
Compute
(
{
Expr
(
M
),
Expr
(
N
)},
[
&
](
Var
i
,
Var
j
)
{
return
A
(
i
,
j
)
*
B
(
i
,
j
);
},
"C"
);
Tensor
D
=
Compute
(
{
Expr
(
M
),
Expr
(
N
)},
[
&
](
Var
i
,
Var
j
)
{
return
A
(
i
,
j
)
*
B
(
i
,
j
);
},
"D"
);
auto
stages
=
CreateStages
({
C
,
D
});
// vectorize C, not D
stages
[
C
]
->
Vectorize
(
1
,
16
);
stages
[
C
]
->
Unroll
(
1
);
auto
func
=
Lower
(
"matmul"
,
stages
,
{
A
,
B
,
C
,
D
});
std
::
cout
<<
"before optim
\n
"
<<
func
->
body
<<
std
::
endl
;
ir
::
Module
::
Builder
builder
(
"module1"
,
target
);
builder
.
AddFunction
(
func
);
CodeGenCX86
codegen
(
target
,
CodeGenCX86
::
Feature
::
AVX512
);
codegen
.
SetInlineBuiltinCodes
(
false
);
auto
out
=
codegen
.
Compile
(
builder
.
Build
(),
CodeGenC
::
OutputKind
::
CImpl
);
std
::
cout
<<
"out:
\n
"
<<
out
;
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_cuda_dev.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include <glog/logging.h>
#include <paddle/cinn/utils/string.h>
#include <fstream>
#include <set>
#include <unordered_set>
#include "paddle/cinn/ir/op/ir_operators.h"
#include "paddle/cinn/ir/utils/ir_verify.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/optim/remove_nested_block.h"
namespace
cinn
{
namespace
backends
{
const
std
::
string
CodeGenCUDA_Dev
::
source_header_
=
// NOLINT
R"(#include <cstdint>
#define CINN_WITH_CUDA
#include "bfloat16.h"
#include "float16.h"
using cinn::common::bfloat16;
using cinn::common::float16;
using cinn::common::half4;
using cinn::common::half8;
using cinn::common::float8;
#include "cinn_cuda_runtime_source.cuh"
)"
;
const
std
::
string
&
CodeGenCUDA_Dev
::
GetSourceHeader
()
{
return
source_header_
;
}
CodeGenCUDA_Dev
::
CodeGenCUDA_Dev
(
Target
target
)
:
CodeGenC
(
target
)
{}
std
::
string
CodeGenCUDA_Dev
::
Compile
(
const
ir
::
Module
&
module
,
bool
for_nvrtc
)
{
for_nvrtc_
=
for_nvrtc
;
auto
source
=
Compile
(
module
,
OutputKind
::
CImpl
);
return
source
;
}
void
CodeGenCUDA_Dev
::
Compile
(
const
ir
::
Module
&
module
,
const
Outputs
&
outputs
)
{
ir
::
IrVerify
(
Expr
(
module
));
CodeGenC
::
inline_builtin_codes_
=
false
;
if
(
!
outputs
.
c_header_name
.
empty
())
{
auto
source
=
Compile
(
module
,
OutputKind
::
CHeader
);
str_
=
""
;
std
::
ofstream
file
(
outputs
.
c_header_name
);
CHECK
(
file
.
is_open
())
<<
"failed to open file "
<<
outputs
.
c_header_name
;
file
<<
source
;
file
.
close
();
LOG
(
WARNING
)
<<
"Output C header to file "
<<
outputs
.
c_header_name
;
}
if
(
!
outputs
.
cuda_source_name
.
empty
())
{
auto
source
=
Compile
(
module
,
OutputKind
::
CImpl
);
str_
=
""
;
std
::
ofstream
file
(
outputs
.
cuda_source_name
);
CHECK
(
file
.
is_open
())
<<
"failed to open file "
<<
outputs
.
cuda_source_name
;
file
<<
source
;
file
.
close
();
LOG
(
WARNING
)
<<
"Output C source to file "
<<
outputs
.
cuda_source_name
;
}
}
void
CodeGenCUDA_Dev
::
Compile
(
const
ir
::
LoweredFunc
&
func
)
{
IrPrinter
::
Visit
(
Expr
(
func
));
}
std
::
vector
<
Expr
>
CodeGenCUDA_Dev
::
GenerateBufferAliasExprs
(
const
ir
::
_LoweredFunc_
*
op
,
const
std
::
vector
<
ir
::
Buffer
>
&
temp_buffers
)
{
std
::
set
<
ir
::
Buffer
>
temp_buffer_set
(
temp_buffers
.
begin
(),
temp_buffers
.
end
());
// prepare temp buffer alias
std
::
vector
<
Expr
>
buffer_alias
;
auto
tensors
=
ir
::
CollectIRNodes
(
op
->
body
,
[
&
](
const
Expr
*
x
)
{
return
x
->
as_tensor
()
&&
x
->
as_tensor
()
->
buffer
.
defined
()
&&
temp_buffer_set
.
count
(
x
->
as_tensor
()
->
buffer
);
});
// unique tensors
std
::
set
<
ir
::
Tensor
>
unique_tensors
;
for
(
auto
&
e
:
tensors
)
{
unique_tensors
.
insert
(
e
.
as_tensor_ref
());
}
for
(
auto
&
t
:
unique_tensors
)
{
auto
data_type
=
t
->
type
();
auto
data_ptr_type
=
data_type
;
data_ptr_type
.
set_cpp_handle
();
Var
t_var
(
t
->
name
,
data_ptr_type
);
Var
buf_var
(
t
->
buffer
->
name
,
data_ptr_type
);
buffer_alias
.
push_back
(
ir
::
Let
::
Make
(
t_var
,
buf_var
));
}
return
buffer_alias
;
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
_LoweredFunc_
*
op
)
{
// clear names valid within scope when enter a new function
vectorized_tensor_names_
.
clear
();
str_
+=
"__global__
\n
"
;
PrintFunctionDeclaration
(
op
);
str_
+=
"
\n
"
;
DoIndent
();
std
::
vector
<
Expr
>
new_body
;
auto
alloca_temp_buffers
=
op
->
PrepareAllocTempBufferExprs
();
auto
temp_buffer_alias
=
GenerateBufferAliasExprs
(
op
,
op
->
temp_bufs
);
auto
alis_var_exprs
=
op
->
CudaAliasVarExprs
();
#define APPEND_TO_NEW_BODY(field__) \
new_body.insert(std::end(new_body), std::begin(field__), std::end(field__));
APPEND_TO_NEW_BODY
(
alloca_temp_buffers
)
APPEND_TO_NEW_BODY
(
temp_buffer_alias
)
APPEND_TO_NEW_BODY
(
alis_var_exprs
)
new_body
.
push_back
(
op
->
body
);
Expr
func_body
=
ir
::
Block
::
Make
(
new_body
);
optim
::
RemoveNestedBlock
(
&
func_body
);
// Make sure that the function's body is wrapped by a block
if
(
!
func_body
.
As
<
ir
::
Block
>
())
{
func_body
=
ir
::
Block
::
Make
({
func_body
});
}
IrPrinter
::
Visit
(
func_body
);
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
_Var_
*
op
)
{
if
(
utils
::
Startswith
(
op
->
name
,
"threadIdx"
)
||
utils
::
Startswith
(
op
->
name
,
"blockIdx"
))
{
str_
+=
"(int)"
;
str_
+=
op
->
name
;
}
else
{
str_
+=
op
->
name
;
}
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
Alloc
*
op
)
{
CHECK
(
op
->
destination
.
as_buffer
());
PrintTempBufferCreation
(
op
->
destination
.
as_buffer_ref
());
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
Min
*
op
)
{
str_
+=
"min("
;
IrPrinter
::
Visit
(
op
->
a
());
str_
+=
", "
;
IrPrinter
::
Visit
(
op
->
b
());
str_
+=
")"
;
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
Max
*
op
)
{
str_
+=
"max("
;
IrPrinter
::
Visit
(
op
->
a
());
str_
+=
", "
;
IrPrinter
::
Visit
(
op
->
b
());
str_
+=
")"
;
}
void
CodeGenCUDA_Dev
::
PrintFunctionDeclaration
(
const
ir
::
_LoweredFunc_
*
op
)
{
str_
+=
"void "
;
if
(
op
->
cuda_axis_info
.
valid
())
{
int
thread_num
=
1
;
for
(
int
i
=
0
;
i
<
3
;
i
++
)
{
thread_num
*=
op
->
cuda_axis_info
.
block_dim
(
i
);
}
str_
+=
"__launch_bounds__("
;
str_
+=
std
::
to_string
(
thread_num
);
str_
+=
") "
;
}
str_
+=
op
->
name
;
str_
+=
"("
;
for
(
int
i
=
0
;
i
<
op
->
args
.
size
()
-
1
;
i
++
)
{
auto
&
arg
=
op
->
args
[
i
];
PrintFuncArg
(
arg
);
str_
+=
", "
;
}
if
(
!
op
->
args
.
empty
())
{
PrintFuncArg
(
op
->
args
.
back
());
}
str_
+=
")"
;
}
void
CodeGenCUDA_Dev
::
PrintFuncArg
(
const
ir
::
Argument
&
arg
)
{
if
(
arg
.
is_buffer
())
{
// In CUDA kernel, only primitive type is supported, so we replace the
// buffer with T*j
if
(
arg
.
is_input
())
str_
+=
"const "
;
str_
+=
GetTypeRepr
(
arg
.
buffer_arg
()
->
dtype
);
str_
+=
"* "
;
str_
+=
kCKeywordRestrict
;
str_
+=
" "
;
str_
+=
ir
::
BufferGetTensorName
(
arg
.
buffer_arg
().
As
<
ir
::
_Buffer_
>
());
}
else
if
(
arg
.
is_var
())
{
if
(
arg
.
var_arg
()
->
type
().
is_cpp_handle
())
{
str_
+=
kCKeywordRestrict
;
}
str_
+=
GetTypeRepr
(
arg
.
type
());
str_
+=
" "
;
str_
+=
arg
.
name
();
}
else
{
CINN_NOT_IMPLEMENTED
}
}
void
CodeGenCUDA_Dev
::
PrintBuiltinCodes
()
{}
std
::
string
CodeGenCUDA_Dev
::
Compile
(
const
ir
::
Module
&
module
,
CodeGenC
::
OutputKind
output_kind
)
{
if
(
output_kind
==
OutputKind
::
CHeader
)
{
GenerateHeaderFile
(
module
);
}
else
if
(
output_kind
==
OutputKind
::
CImpl
)
{
PrintIncludes
();
if
(
for_nvrtc_
)
{
str_
+=
"
\n
extern
\"
C
\"
{
\n\n
"
;
}
PrintBuiltinCodes
();
for
(
auto
&
func
:
module
.
functions
())
{
Compile
(
func
);
}
}
else
{
LOG
(
FATAL
)
<<
"Not supported OutputKind"
;
}
if
(
for_nvrtc_
)
{
str_
+=
"
\n\n
}"
;
}
return
str_
;
}
void
CodeGenCUDA_Dev
::
PrintIncludes
()
{
str_
+=
GetSourceHeader
();
}
void
CodeGenCUDA_Dev
::
PrintTempBufferCreation
(
const
ir
::
Buffer
&
buffer
)
{
CHECK_NE
(
buffer
->
type
(),
Void
());
auto
print_gpu_memory
=
[
&
](
const
std
::
string
&
mark
)
{
str_
+=
mark
;
str_
+=
GetTypeRepr
(
buffer
->
dtype
);
str_
+=
" "
;
str_
+=
buffer
->
name
;
str_
+=
" "
;
str_
+=
"[ "
;
Expr
buffer_size
(
1
);
for
(
int
i
=
0
;
i
<
buffer
->
shape
.
size
();
i
++
)
{
buffer_size
=
buffer_size
*
buffer
->
shape
[
i
];
}
optim
::
Simplify
(
&
buffer_size
);
IrPrinter
::
Visit
(
buffer_size
);
str_
+=
" ]"
;
};
switch
(
buffer
->
memory_type
)
{
case
ir
::
MemoryType
::
GPUShared
:
print_gpu_memory
(
"__shared__ "
);
break
;
case
ir
::
MemoryType
::
GPULocal
:
print_gpu_memory
(
""
);
break
;
default:
LOG
(
FATAL
)
<<
"CUDA device codegen not support memory "
<<
buffer
->
name
<<
", type "
<<
buffer
->
memory_type
;
}
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
Call
*
op
)
{
str_
+=
op
->
name
;
str_
+=
"("
;
if
(
!
op
->
read_args
.
empty
())
{
for
(
int
i
=
0
;
i
<
op
->
read_args
.
size
()
-
1
;
i
++
)
{
auto
&
arg
=
op
->
read_args
[
i
];
if
(
arg
.
as_tensor
())
{
str_
+=
arg
.
as_tensor
()
->
name
;
str_
+=
", "
;
}
else
{
IrPrinter
::
Visit
(
arg
);
str_
+=
", "
;
}
}
if
(
op
->
read_args
.
back
().
as_tensor
())
{
str_
+=
op
->
read_args
.
back
().
as_tensor
()
->
name
;
}
else
{
IrPrinter
::
Visit
(
op
->
read_args
.
back
());
}
}
if
(
!
op
->
write_args
.
empty
())
{
str_
+=
", "
;
for
(
int
i
=
0
;
i
<
op
->
write_args
.
size
()
-
1
;
i
++
)
{
auto
&
arg
=
op
->
write_args
[
i
];
if
(
arg
.
as_tensor
())
{
str_
+=
arg
.
as_tensor
()
->
name
;
str_
+=
", "
;
}
else
{
IrPrinter
::
Visit
(
arg
);
str_
+=
", "
;
}
}
if
(
op
->
write_args
.
back
().
as_tensor
())
{
str_
+=
op
->
write_args
.
back
().
as_tensor
()
->
name
;
}
else
{
IrPrinter
::
Visit
(
op
->
write_args
.
back
());
}
}
str_
+=
")"
;
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
Let
*
op
)
{
CHECK
(
op
->
type
().
valid
());
// identify vectorized tensors by checking their dtypes are customized_type
// with customized_type::kcuda_builtin_vector_t prefix, and save their names
if
(
op
->
type
().
is_customized
()
&&
utils
::
Startswith
(
op
->
type
().
customized_type
(),
common
::
customized_type
::
kcuda_builtin_vector_t
))
{
str_
+=
GetTypeRepr
(
op
->
type
());
if
(
op
->
type
().
is_cpp_handle
())
{
str_
+=
" "
;
str_
+=
kCKeywordRestrict
;
}
str_
+=
" "
;
IrPrinter
::
Visit
(
op
->
symbol
);
vectorized_tensor_names_
.
insert
(
utils
::
GetStreamCnt
(
op
->
symbol
));
// skip "=0" in "half8 temp = 0;" sincethe operator= of half8 may not
// overloaded.
if
(
op
->
body
.
As
<
ir
::
IntImm
>
()
&&
op
->
body
.
As
<
ir
::
IntImm
>
()
->
value
==
0
)
{
return
;
}
str_
+=
" = "
;
IrPrinter
::
Visit
(
op
->
body
);
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
bool
CodeGenCUDA_Dev
::
PrintBuiltinVectorAccess
(
const
ir
::
LoadStoreAddrMnger
*
op
,
ir
::
Expr
index_expr
,
bool
is_store
)
{
static
constexpr
char
index2suffix
[
8
]
=
{
'x'
,
'y'
,
'z'
,
'w'
,
'v'
,
'u'
,
't'
,
's'
};
// addr of op should be a place of tensor and the index is simple int number
if
(
!
op
->
is_addr_tensor
()
||
!
index_expr
.
As
<
ir
::
IntImm
>
())
{
return
false
;
}
auto
*
tensor
=
op
->
tensor
.
As
<
ir
::
_Tensor_
>
();
CHECK
(
tensor
);
// identify vectorized tensors by their names
if
(
!
vectorized_tensor_names_
.
count
(
tensor
->
name
))
{
return
false
;
}
// the index can't exceed the range of cuda built-in vector type
int
index
=
index_expr
.
As
<
ir
::
IntImm
>
()
->
value
;
if
(
index
<
0
||
index
>=
8
)
{
return
false
;
}
if
(
is_store
&&
tensor
->
type
().
is_cpp_handle
())
{
str_
+=
tensor
->
name
;
str_
+=
"["
;
str_
+=
std
::
to_string
(
index
);
str_
+=
"]"
;
}
else
{
str_
+=
tensor
->
name
;
str_
+=
(
tensor
->
type
().
is_cpp_handle
()
?
"->"
:
"."
);
str_
+=
index2suffix
[
index
];
}
return
true
;
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
Load
*
op
)
{
// overload this visit function to especially deal with the case when it
// accesses element at a cuda built-in vector, others still resolve to
// CodeGenC
if
(
!
PrintBuiltinVectorAccess
(
op
,
op
->
index
(),
false
))
{
CodeGenC
::
Visit
(
op
);
}
}
void
CodeGenCUDA_Dev
::
Visit
(
const
ir
::
Store
*
op
)
{
// overload this visit function to especially deal with the case when it
// accesses element at a cuda built-in vector, others still resolve to
// CodeGenC
if
(
PrintBuiltinVectorAccess
(
op
,
op
->
index
(),
true
))
{
str_
+=
" = "
;
IrPrinter
::
Visit
(
op
->
value
);
}
else
{
CodeGenC
::
Visit
(
op
);
}
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_cuda_dev.h
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/cinn/backends/codegen_c.h"
#include "paddle/cinn/common/common.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/packed_func.h"
#include "paddle/cinn/runtime/cinn_runtime.h"
namespace
cinn
::
ir
{
class
Module
;
}
// namespace cinn::ir
namespace
cinn
{
namespace
backends
{
/**
* CUDA device code generator.
*
* It generates the device function, e.g, the function called "myadd" will have
* a __global__ functon called "myadd_kernel", different from codegen_c, the
* declaration of the "myadd_kernel" function has an expanded argument list,
* which finally similar to `__global__ void myadd(float* __restrict__ A, float*
* __restrict__ B, int n);`
*/
class
CodeGenCUDA_Dev
:
public
CodeGenC
{
public:
explicit
CodeGenCUDA_Dev
(
Target
target
);
/**
* Compile the \p module to \p outputs.
*/
void
Compile
(
const
ir
::
Module
&
module
,
const
Outputs
&
outputs
);
//! Compile on NVRTC.
std
::
string
Compile
(
const
ir
::
Module
&
module
,
bool
for_nvrtc
=
true
);
void
Compile
(
const
ir
::
LoweredFunc
&
func
);
/**
* \brief Print a function argument in CUDA syntax. Currently, just some
* decoration of __restrict__.
* @param arg the argument.
* @return the representation in CUDA syntax.
*
* We make it a static to make the test easier.
*/
void
PrintFuncArg
(
const
ir
::
Argument
&
arg
);
std
::
string
Compile
(
const
ir
::
Module
&
module
,
OutputKind
output_kind
);
static
const
std
::
string
&
GetSourceHeader
();
protected:
void
Visit
(
const
ir
::
_Var_
*
op
)
override
;
void
Visit
(
const
ir
::
_LoweredFunc_
*
op
)
override
;
void
Visit
(
const
ir
::
Min
*
op
)
override
;
void
Visit
(
const
ir
::
Max
*
op
)
override
;
void
Visit
(
const
ir
::
Alloc
*
op
)
override
;
void
Visit
(
const
ir
::
Call
*
op
)
override
;
void
Visit
(
const
ir
::
Load
*
op
)
override
;
void
Visit
(
const
ir
::
Store
*
op
)
override
;
void
Visit
(
const
ir
::
Let
*
op
)
override
;
// Print element access at a cuda built-in vector on a load/store node
bool
PrintBuiltinVectorAccess
(
const
ir
::
LoadStoreAddrMnger
*
op
,
ir
::
Expr
index
,
bool
is_store
);
void
PrintBuiltinCodes
();
void
PrintIncludes
()
override
;
void
PrintTempBufferCreation
(
const
ir
::
Buffer
&
buffer
);
void
PrintTempBufferAliasDefinition
(
const
ir
::
Buffer
&
buffer
);
std
::
vector
<
Expr
>
GenerateBufferAliasExprs
(
const
ir
::
_LoweredFunc_
*
op
,
const
std
::
vector
<
ir
::
Buffer
>&
temp_buffers
);
/**
* Print the function declaration, this is different from C, we expand the
* arguments and get something like
* `__global__ void myadd(float* __restrict__ A, float* __restrict__ B, int
* n);`
*/
void
PrintFunctionDeclaration
(
const
ir
::
_LoweredFunc_
*
op
);
private:
Target
target_
;
bool
for_nvrtc_
{
false
};
// names of vectorized tensors from `Let` statments where dtypes of the
// tensors are customized_type with customized_type::kcuda_builtin_vector_t
// prefix
std
::
unordered_set
<
std
::
string
>
vectorized_tensor_names_
;
static
const
std
::
string
source_header_
;
};
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_cuda_generate_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <stdlib.h>
#include <fstream>
#include <tuple>
#include <vector>
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/execution_engine.h"
#include "paddle/cinn/backends/llvm/simple_jit.h"
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/common/ir_util.h"
#include "paddle/cinn/common/test_helper.h"
#include "paddle/cinn/hlir/pe/nn.h"
#include "paddle/cinn/hlir/pe/schedule.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/utils/timer.h"
namespace
cinn
{
namespace
backends
{
TEST
(
CUDAFile
,
Module_output
)
{
std
::
string
cuda_source_name
=
"_generated1.cu"
;
std
::
string
cuda_source_code
=
R"ROC(
extern "C" {
__global__
void __launch_bounds__(200) elementwise_mul(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C)
{
if (((int)blockIdx.x < 100)) {
if (((int)threadIdx.x < 200)) {
C[((200 * (int)blockIdx.x) + (int)threadIdx.x)] = (A[((200 * (int)blockIdx.x) + (int)threadIdx.x)] * B[((200 * (int)blockIdx.x) + (int)threadIdx.x)]);
};
};
}
}
)ROC"
;
std
::
ofstream
file
(
cuda_source_name
);
CHECK
(
file
.
is_open
())
<<
"failed to open file "
<<
cuda_source_name
;
file
<<
CodeGenCUDA_Dev
::
GetSourceHeader
();
file
<<
cuda_source_code
;
file
.
close
();
LOG
(
WARNING
)
<<
"Output C source to file "
<<
cuda_source_name
;
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_cuda_host.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/llvm_util.h"
#include "paddle/cinn/runtime/intrinsic.h"
namespace
cinn
{
namespace
backends
{
using
cinn
::
common
::
bfloat16
;
using
cinn
::
common
::
float16
;
const
int
kArgsArrayMaxLen
=
20
;
llvm
::
Value
*
CodeGenCUDA_Host
::
LowerGPUKernelLauncher
(
const
ir
::
_LoweredFunc_
*
func
)
{
auto
body
=
func
->
body
;
auto
*
call_ir
=
body
.
As
<
ir
::
Call
>
();
CHECK
(
call_ir
);
// Create the function
// @{
auto
*
function_type
=
GenFunctionTypeFromCinnFunction
(
func
,
true
);
llvm
::
Function
*
function
=
llvm
::
Function
::
Create
(
function_type
,
llvm
::
Function
::
ExternalLinkage
,
func
->
name
,
m_
);
function
->
setCallingConv
(
llvm
::
CallingConv
::
C
);
function
->
setHasUWTable
();
std
::
vector
<
llvm
::
Value
*>
ll_function_args
;
std
::
transform
(
function
->
arg_begin
(),
function
->
arg_end
(),
std
::
back_inserter
(
ll_function_args
),
[](
auto
&
arg
)
{
return
std
::
addressof
(
arg
);
});
// @}
llvm
::
BasicBlock
*
entry
=
llvm
::
BasicBlock
::
Create
(
/*Context=*/
b_
->
getContext
(),
/*Name=*/
"entry"
,
/*Parent=*/
function
,
/*InsertBefore=*/
nullptr
);
b_
->
SetInsertPoint
(
entry
);
auto
*
kernel_args
=
ll_function_args
[
0
];
auto
*
kernel_args_count
=
ll_function_args
[
1
];
llvm
::
Value
*
kernel_stream
=
nullptr
;
if
(
ll_function_args
.
size
()
==
3
)
{
kernel_stream
=
ll_function_args
[
2
];
CHECK_EQ
(
kernel_stream
->
getType
(),
ll_void_p_ty
());
// void* stream
}
CHECK_EQ
(
kernel_args
->
getType
(),
ll_void_p_ty
());
// void* args
CHECK_EQ
(
kernel_args_count
->
getType
(),
ll_int32_ty
());
// int32
std
::
unordered_map
<
std
::
string
,
llvm
::
Value
*>
global_args
=
{
{
KERNEL_ARGS
,
kernel_args
},
{
KERNEL_ARGS_NUM
,
kernel_args_count
},
{
KERNEL_STREAM
,
kernel_stream
}};
auto
ret_type
=
CinnTypeToLLVMType
(
Void
(),
m_
);
std
::
vector
<
llvm
::
Type
*>
args_type
;
for
(
auto
r_arg
:
call_ir
->
read_args
)
{
if
(
r_arg
.
is_var
())
{
if
(
r_arg
.
as_var
()
->
type
().
is_cpp_handle
()
||
r_arg
.
as_var
()
->
type
().
is_string
())
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
void
*>
(),
m_
));
}
else
if
(
r_arg
.
as_var
()
->
type
().
is_int
(
32
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
int32_t
>
(),
m_
));
}
else
{
CINN_NOT_IMPLEMENTED
;
}
}
else
{
if
(
r_arg
.
type
().
is_bool
())
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
bool
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_uint
(
8
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
uint8_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_uint
(
16
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
uint16_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_uint
(
32
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
uint32_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_uint
(
64
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
uint64_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_int
(
8
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
int8_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_int
(
16
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
int16_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_int
(
32
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
int32_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_int
(
64
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
int64_t
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_float
(
32
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
float
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_float
(
64
))
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
double
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_bfloat16
())
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
bfloat16
>
(),
m_
));
}
else
if
(
r_arg
.
type
().
is_float16
())
{
args_type
.
push_back
(
CinnTypeToLLVMType
(
type_of
<
float16
>
(),
m_
));
}
else
{
CINN_NOT_IMPLEMENTED
;
}
}
}
auto
func_type
=
llvm
::
FunctionType
::
get
(
ret_type
,
args_type
,
false
);
auto
call_func
=
m_
->
getOrInsertFunction
(
call_ir
->
name
,
func_type
);
std
::
vector
<
llvm
::
Value
*>
call_args
;
for
(
auto
&
r_arg
:
call_ir
->
read_args
)
{
if
(
r_arg
.
is_var
())
{
if
(
r_arg
.
as_var
()
->
type
().
is_string
())
{
auto
kvalue
=
m_
->
getOrInsertGlobal
(
r_arg
.
as_var
()
->
name
+
"_ptr_"
,
b_
->
getInt8PtrTy
());
call_args
.
push_back
(
b_
->
CreateLoad
(
b_
->
getInt8PtrTy
(),
kvalue
,
r_arg
.
as_var
()
->
name
+
"_ptr_load"
));
}
else
if
(
r_arg
.
as_var
()
->
type
().
is_cpp_handle
()
||
r_arg
.
as_var
()
->
type
().
is_int
(
32
))
{
CHECK
(
global_args
.
count
(
r_arg
.
as_var
()
->
name
));
call_args
.
push_back
(
global_args
[
r_arg
.
as_var
()
->
name
]);
}
else
{
CINN_NOT_IMPLEMENTED
;
}
}
else
{
if
(
r_arg
.
type
().
is_bool
())
{
call_args
.
push_back
(
b_
->
getInt1
(
r_arg
.
as_bool
()));
}
else
if
(
r_arg
.
type
().
is_int
(
8
))
{
call_args
.
push_back
(
b_
->
getInt8
(
r_arg
.
as_int8
()));
}
else
if
(
r_arg
.
type
().
is_int
(
16
))
{
call_args
.
push_back
(
b_
->
getInt16
(
r_arg
.
as_int16
()));
}
else
if
(
r_arg
.
type
().
is_int
(
32
))
{
call_args
.
push_back
(
b_
->
getInt32
(
r_arg
.
as_int32
()));
}
else
if
(
r_arg
.
type
().
is_int
(
64
))
{
call_args
.
push_back
(
b_
->
getInt64
(
r_arg
.
as_int64
()));
}
else
if
(
r_arg
.
type
().
is_uint
(
8
))
{
call_args
.
push_back
(
b_
->
getInt8
(
r_arg
.
as_uint8
()));
}
else
if
(
r_arg
.
type
().
is_uint
(
16
))
{
call_args
.
push_back
(
b_
->
getInt16
(
r_arg
.
as_uint16
()));
}
else
if
(
r_arg
.
type
().
is_uint
(
32
))
{
call_args
.
push_back
(
b_
->
getInt32
(
r_arg
.
as_uint32
()));
}
else
if
(
r_arg
.
type
().
is_uint
(
64
))
{
call_args
.
push_back
(
b_
->
getInt64
(
r_arg
.
as_uint64
()));
}
else
if
(
r_arg
.
type
().
is_float
(
32
))
{
call_args
.
push_back
(
llvm
::
ConstantFP
::
get
(
b_
->
getFloatTy
(),
llvm
::
APFloat
(
r_arg
.
as_float
())));
}
else
if
(
r_arg
.
type
().
is_float
(
64
))
{
call_args
.
push_back
(
llvm
::
ConstantFP
::
get
(
b_
->
getDoubleTy
(),
llvm
::
APFloat
(
r_arg
.
as_double
())));
}
else
if
(
r_arg
.
type
().
is_bfloat16
())
{
call_args
.
push_back
(
llvm
::
ConstantFP
::
get
(
b_
->
getBFloatTy
(),
llvm
::
APFloat
(
static_cast
<
float
>
(
r_arg
.
as_bfloat16
()))));
}
else
if
(
r_arg
.
type
().
is_float16
())
{
call_args
.
push_back
(
llvm
::
ConstantFP
::
get
(
b_
->
getHalfTy
(),
llvm
::
APFloat
(
static_cast
<
float
>
(
r_arg
.
as_float16
()))));
}
else
{
CINN_NOT_IMPLEMENTED
;
}
}
}
b_
->
CreateCall
(
call_func
,
call_args
);
RetVoid
();
return
function
;
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_cuda_host.h
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/container/flat_hash_map.h>
#include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/cinn/backends/llvm/codegen_llvm.h"
namespace
cinn
{
namespace
backends
{
/**
* CodeGenCUDA takes a CINN Module with host functions and output a LLVM module.
*/
class
CodeGenCUDA_Host
:
public
CodeGenLLVM
{
public:
explicit
CodeGenCUDA_Host
(
llvm
::
Module
*
m
,
llvm
::
IRBuilder
<>
*
b
,
const
std
::
shared_ptr
<
SymbolTable
>
&
vars
=
nullptr
)
:
CodeGenLLVM
(
m
,
b
,
vars
)
{}
using
CodeGenLLVM
::
Visit
;
llvm
::
Value
*
Visit
(
const
ir
::
_LoweredFunc_
*
func
)
override
{
return
LowerGPUKernelLauncher
(
func
);
}
private:
/**
* Lower a CUDA kernel launcher.
*
* We launch a CUDA kernel in the following way:
*
* 1. a GPU function (called fn) will compiled to PTX and lower by CUDA driver
* to a function pointer, which we store as a `void*` type global variable
* [fn_kernel_ptr] in LLVM module.
* 2. when lower the host launcher, we replace the Call of the original kernel
* [fn] to a Call of `cinn_call_cuda_kernel` method which is registered as an
* external function.
*
*/
llvm
::
Value
*
LowerGPUKernelLauncher
(
const
ir
::
_LoweredFunc_
*
func
);
};
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_cuda_util.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/cuda_util.h"
#include "paddle/cinn/ir/utils/ir_mutator.h"
namespace
cinn
{
namespace
backends
{
std
::
tuple
<
ir
::
Module
,
ir
::
Module
>
SplitCudaAndHostModule
(
ir
::
Module
module
)
{
detail
::
CollectHostFunctionVisitor
visitor
(
module
->
name
);
Expr
expr
(
module
);
return
visitor
(
&
expr
);
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_cuda_util.h
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/container/flat_hash_map.h>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_mutator.h"
namespace
cinn
{
namespace
backends
{
#define KERNEL_ARGS "kernel_args"
#define KERNEL_ARGS_NUM "kernel_args_num"
#define KERNEL_STREAM "kernel_stream"
/**
* Split a CINN Module into two separate modules, one cantains the host
* functions, the other contains the device kernels.
*
* This contains some process:
*
* - replace the original kernel function with a Call node and add it to the
* first module, add a device kernel function to the second module.
*/
std
::
tuple
<
ir
::
Module
,
ir
::
Module
>
SplitCudaAndHostModule
(
ir
::
Module
module
);
namespace
detail
{
struct
CollectHostFunctionVisitor
:
public
ir
::
IRMutator
<>
{
explicit
CollectHostFunctionVisitor
(
const
std
::
string
&
module_name
)
:
host_module_builder
(
module_name
+
"_host"
,
common
::
DefaultHostTarget
()),
device_module_builder
(
module_name
+
"_gpu_device"
,
common
::
DefaultNVGPUTarget
())
{}
std
::
tuple
<
ir
::
Module
,
ir
::
Module
>
operator
()(
Expr
*
expr
)
{
ir
::
IRMutator
<>::
Visit
(
expr
,
expr
);
return
std
::
make_tuple
(
host_module_builder
.
Build
(),
device_module_builder
.
Build
());
}
private:
void
Visit
(
const
ir
::
_LoweredFunc_
*
op
,
Expr
*
expr
)
override
{
if
(
op
->
body
.
As
<
ir
::
Call
>
())
{
host_module_builder
.
AddFunctionWithoutOptim
(
expr
->
as_lowered_func_ref
());
}
else
{
if
(
!
op
->
cuda_axis_info
.
valid
())
{
expr
->
as_lowered_func_ref
()
->
cuda_axis_info
.
set_valid
(
true
);
}
auto
host_func
=
CreateHostFunctionGivenDeviceKernel
(
op
);
host_module_builder
.
AddFunctionWithoutOptim
(
host_func
.
as_lowered_func_ref
());
device_module_builder
.
AddFunctionWithoutOptim
(
CreateDeviceFunctionGivenDeviceKernel
(
*
expr
).
as_lowered_func_ref
());
}
}
/**
* Create a wrapper function for a kernel.
*
* For example, we get a kernel function:
*
* \code
* __global__
* void fn (float* a, float* out) { ... }
* \endcode
*
* A host wrapper function will generate for it
*
* \code
* void fn (cinn_buffer_t* a, cinn_buffer_t* out) {
* Call(fn_kernel);
* }
* \endcode
*/
Expr
CreateHostFunctionGivenDeviceKernel
(
const
ir
::
_LoweredFunc_
*
func
)
{
// std::vector<Expr> args;
// NOTE the suffix `__ptr` makes this argument lower to a pointer in LLVM
// backend. args.push_back(Var("args__ptr", type_of<cinn_pod_value_t*>()));
// args.push_back(Var("num_args", type_of<int32_t>()));
ir
::
Var
kernel_ptr
(
GenDeviceKernelName
(
func
->
name
),
type_of
<
std
::
string
>
());
ir
::
Var
kernel_args
(
KERNEL_ARGS
,
type_of
<
void
*>
());
ir
::
Var
kernel_args_num
(
KERNEL_ARGS_NUM
,
type_of
<
int
>
());
ir
::
Var
kernel_stream
(
KERNEL_STREAM
,
type_of
<
void
*>
());
auto
call_extern_api
=
ir
::
Call
::
Make
(
Void
(),
runtime
::
intrinsic
::
call_cuda_kernel
,
{
kernel_ptr
,
kernel_args
,
kernel_args_num
,
Expr
(
func
->
cuda_axis_info
.
grid_dim
(
0
)),
// grid_x
Expr
(
func
->
cuda_axis_info
.
grid_dim
(
1
)),
// grid_y
Expr
(
func
->
cuda_axis_info
.
grid_dim
(
2
)),
// grid_z
Expr
(
func
->
cuda_axis_info
.
block_dim
(
0
)),
// block_x
Expr
(
func
->
cuda_axis_info
.
block_dim
(
1
)),
// block_y
Expr
(
func
->
cuda_axis_info
.
block_dim
(
2
)),
// block_z
kernel_stream
},
{},
ir
::
CallType
::
Extern
,
ir
::
FunctionRef
(),
0
);
std
::
vector
<
ir
::
Argument
>
arguments
=
{
ir
::
Argument
(
kernel_args
,
ir
::
Argument
::
IO
::
kOutput
),
ir
::
Argument
(
kernel_args_num
,
ir
::
Argument
::
IO
::
kInput
),
ir
::
Argument
(
kernel_stream
,
ir
::
Argument
::
IO
::
kOutput
)};
return
ir
::
_LoweredFunc_
::
Make
(
func
->
name
,
arguments
,
call_extern_api
,
{});
}
Expr
CreateDeviceFunctionGivenDeviceKernel
(
Expr
expr
)
{
auto
copied
=
optim
::
IRCopy
(
expr
);
auto
*
lowered_func
=
copied
.
as_lowered_func
();
lowered_func
->
name
=
GenDeviceKernelName
(
lowered_func
->
name
);
return
copied
;
}
inline
std
::
string
GenDeviceKernelName
(
const
std
::
string
&
fn
)
{
return
fn
+
"_kernel"
;
}
private:
ir
::
Module
::
Builder
host_module_builder
;
ir
::
Module
::
Builder
device_module_builder
;
};
}
// namespace detail
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/codegen_debug_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
namespace
cinn
{
namespace
backends
{
/**
* This file is not a common test, it is used as a util for developers to
* write source CUDA code to debug whether it runs correctly during runtime
*/
using
runtime
::
cuda
::
CUDAModule
;
/**
* Utility function to create cuda memory of non-empty shape.
*
* @param shape: a non-empty shape for the created cuda memory
* @param data: the data to initialize the cuda memory. Function doesn't
* initailize if it is nullptr
* @return the CUdeviceptr pointing to the created memory
*/
template
<
typename
T
>
CUdeviceptr
CreateCudaMemory
(
const
std
::
vector
<
int
>&
shape
,
const
T
*
data
)
{
CHECK
(
!
shape
.
empty
())
<<
"Couldn't create CUDA memory for empty shape"
;
CUDA_CALL
(
cudaDeviceSynchronize
());
int
numel
=
1
;
for
(
int
s
:
shape
)
{
numel
=
numel
*
s
;
}
CUdeviceptr
cuda_ptr
=
cuMemAlloc
(
&
cuda_ptr
,
numel
*
sizeof
(
T
));
if
(
data
!=
nullptr
)
{
CUDA_CALL
(
cudaMemcpy
(
reinterpret_cast
<
void
*>
(
cuda_ptr
),
data
,
numel
*
sizeof
(
T
),
cudaMemcpyHostToDevice
));
}
return
cuda_ptr
;
}
TEST
(
CodeGenDebug
,
RunCudaSourceCode
)
{
common
::
Context
::
Global
().
ResetNameId
();
std
::
string
source_code
=
R"ROC(
extern "C" {
__global__
void __launch_bounds__(512) fn_relu_1_kernel(const float* __restrict__ var_1, float* __restrict__ Relu_output)
{
for (int32_t j_0 = 0; j_0 < 8; j_0 += 1) {
for (int32_t j_1 = 0; j_1 < 1; j_1 += 1) {
for (int32_t j_2 = 0; j_2 < 1; j_2 += 1) {
for (int32_t j_3 = 0; j_3 < 8; j_3 += 1) {
for (int32_t j_4 = 0; j_4 < 1; j_4 += 1) {
for (int32_t k_0 = 0; k_0 < 1; k_0 += 1) {
for (int32_t k_1 = 0; k_1 < 7; k_1 += 1) {
for (int32_t k_2 = 0; k_2 < 4; k_2 += 1) {
for (int32_t k_3 = 0; k_3 < 4; k_3 += 1) {
for (int32_t k_4 = 0; k_4 < 1; k_4 += 1) {
for (int32_t a_0 = 0; a_0 < 16; a_0 += 1) {
for (int32_t a_1 = 0; a_1 < 1; a_1 += 1) {
for (int32_t a_2 = 0; a_2 < 1; a_2 += 1) {
for (int32_t a_3 = 0; a_3 < 1; a_3 += 1) {
for (int32_t a_4 = 0; a_4 < 7; a_4 += 1) {
Relu_output[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))] = max(var_1[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))], 0.00000000f);
};
};
};
};
};
};
};
};
};
};
};
};
};
};
};
}
}
)ROC"
;
backends
::
nvrtc
::
Compiler
compiler
;
std
::
string
ptx
=
compiler
(
CodeGenCUDA_Dev
::
GetSourceHeader
()
+
source_code
);
ASSERT_FALSE
(
ptx
.
empty
());
CUDAModule
cuda_module
(
ptx
,
CUDAModule
::
Kind
::
PTX
);
CUdeviceptr
var
=
CreateCudaMemory
<
float
>
(
/* shape */
{
64
*
112
*
112
},
/* data */
nullptr
);
CUdeviceptr
out
=
CreateCudaMemory
<
float
>
(
/* shape */
{
64
*
112
*
112
},
/* data */
nullptr
);
void
*
args
[]
=
{
&
var
,
&
out
};
dim3
grid
(
512
,
1
,
1
);
dim3
block
(
512
,
1
,
1
);
cuda_module
.
LaunchKernel
(
/*device_id*/
0
,
"fn_relu_1_kernel"
,
grid
,
block
,
args
);
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/compiler.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/compiler.h"
#include <fstream>
#include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/hlir/framework/visualize_helper.h"
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#include "paddle/cinn/runtime/cuda/cuda_util.h"
#include "paddle/cinn/runtime/flags.h"
#endif
DECLARE_string
(
cinn_source_code_save_path
);
DECLARE_string
(
cinn_dump_group_lowered_func
);
DECLARE_string
(
cinn_dump_group_source_code
);
DECLARE_string
(
cinn_dump_group_ptx
);
DECLARE_string
(
cinn_dump_group_instruction
);
namespace
cinn
{
namespace
backends
{
using
ir
::
Module
;
static
constexpr
int
DebugLogMaxLen
=
30000
;
void
CompilationInfoDumper
::
DumpLoweredFunc
()
{
if
(
FLAGS_cinn_dump_group_lowered_func
.
empty
())
{
return
;
}
for
(
int
idx
=
0
;
idx
<
info_
.
lowered_funcs
.
size
();
++
idx
)
{
std
::
stringstream
content
;
content
<<
info_
.
lowered_funcs
[
idx
].
front
();
Dump
(
FLAGS_cinn_dump_group_lowered_func
,
idx
,
"lowered_function.txt"
,
content
.
str
());
}
}
void
CompilationInfoDumper
::
DumpSourceCode
()
{
if
(
FLAGS_cinn_dump_group_source_code
.
empty
())
{
return
;
}
for
(
int
idx
=
0
;
idx
<
info_
.
source_codes
.
size
();
++
idx
)
{
Dump
(
FLAGS_cinn_dump_group_source_code
,
idx
,
"source_code.cu"
,
info_
.
source_codes
[
idx
]);
}
}
void
CompilationInfoDumper
::
DumpPtxCode
()
{
if
(
FLAGS_cinn_dump_group_ptx
.
empty
())
{
return
;
}
for
(
int
idx
=
0
;
idx
<
info_
.
source_ptxs
.
size
();
++
idx
)
{
Dump
(
FLAGS_cinn_dump_group_ptx
,
idx
,
"source_ptx.ptx"
,
info_
.
source_ptxs
[
idx
]);
}
}
void
CompilationInfoDumper
::
DumpInstruction
()
{
if
(
FLAGS_cinn_dump_group_instruction
.
empty
())
{
return
;
}
for
(
int
idx
=
0
;
idx
<
info_
.
instructions
.
size
();
++
idx
)
{
Dump
(
FLAGS_cinn_dump_group_instruction
,
idx
,
"instruction.txt"
,
info_
.
instructions
[
idx
]
->
DumpInstruction
());
}
}
void
CompilationInfoDumper
::
Dump
(
const
std
::
string
&
base_path
,
const
int
idx
,
const
std
::
string
&
file_name
,
const
std
::
string
&
content
)
{
auto
dump_path
=
utils
::
StringFormat
(
"%s/fusion_group_%d"
,
base_path
.
c_str
(),
idx
);
if
(
!
hlir
::
framework
::
MakeDirectory
(
dump_path
,
S_IRWXU
|
S_IRGRP
|
S_IXGRP
|
S_IROTH
|
S_IXOTH
))
{
LOG
(
WARNING
)
<<
"Failed to make directory:
\"
"
<<
dump_path
<<
"
\"
, the instruction for this group will not dump."
;
}
else
{
auto
dump_file
=
utils
::
StringFormat
(
"%s/%s"
,
dump_path
.
c_str
(),
file_name
.
c_str
());
VLOG
(
7
)
<<
"Dump instruction to: "
<<
dump_file
;
std
::
ofstream
of
(
dump_file
,
std
::
ios_base
::
out
);
if
(
of
.
is_open
())
{
of
<<
content
;
of
.
close
();
}
else
{
LOG
(
WARNING
)
<<
"Failed to open file: "
<<
dump_file
<<
", please check your path."
;
}
}
}
SourceCodePrint
::
SourceCodePrint
()
{
if
(
!
FLAGS_cinn_source_code_save_path
.
empty
())
{
LOG
(
INFO
)
<<
"The CINN auto generated source code will writing into file:
\"
"
<<
FLAGS_cinn_source_code_save_path
<<
"
\"
"
;
of
.
open
(
FLAGS_cinn_source_code_save_path
,
std
::
ios_base
::
out
);
}
}
SourceCodePrint
::~
SourceCodePrint
()
{
if
(
of
.
is_open
())
{
of
.
close
();
}
}
void
SourceCodePrint
::
write
(
const
std
::
string
&
source_code
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
if
(
of
.
is_open
())
{
of
<<
source_code
<<
std
::
endl
;
}
else
if
(
!
FLAGS_cinn_source_code_save_path
.
empty
())
{
LOG
(
WARNING
)
<<
"Failed to open
\"
"
<<
FLAGS_cinn_source_code_save_path
<<
"
\"
, source code will print."
;
if
(
source_code
.
size
()
>
DebugLogMaxLen
)
{
LOG
(
INFO
)
<<
"[CUDA] source code-0:
\n
"
<<
source_code
.
substr
(
0
,
DebugLogMaxLen
);
for
(
int
i
=
1
;
i
*
DebugLogMaxLen
<
source_code
.
size
();
++
i
)
{
LOG
(
INFO
)
<<
"[CUDA] source code-"
<<
i
<<
":
\n
"
<<
source_code
.
substr
(
DebugLogMaxLen
*
i
,
DebugLogMaxLen
);
}
}
else
{
LOG
(
INFO
)
<<
"[CUDA] source code:
\n
"
<<
source_code
;
}
}
}
void
Compiler
::
Build
(
const
Module
&
module
,
const
std
::
string
&
code
)
{
if
(
target_
.
arch
==
Target
::
Arch
::
NVGPU
)
{
CompileCudaModule
(
module
,
code
);
}
else
if
(
target_
.
arch
==
Target
::
Arch
::
X86
)
{
CompileX86Module
(
module
);
}
else
{
CINN_NOT_IMPLEMENTED
}
}
std
::
string
Compiler
::
GetSourceCode
(
const
ir
::
Module
&
module
)
{
if
(
target_
.
arch
==
Target
::
Arch
::
NVGPU
)
{
#ifdef CINN_WITH_CUDA
auto
_host_module_device_module_
=
SplitCudaAndHostModule
(
module
);
// NOLINT
auto
&
host_module
=
std
::
get
<
0
>
(
_host_module_device_module_
);
auto
&
device_module
=
std
::
get
<
1
>
(
_host_module_device_module_
);
CodeGenCUDA_Dev
codegen
(
target_
);
auto
source_code
=
codegen
.
Compile
(
device_module
);
return
source_code
;
#else
CINN_NOT_IMPLEMENTED
#endif
}
else
{
CINN_NOT_IMPLEMENTED
}
}
void
Compiler
::
BuildDefault
(
const
Module
&
module
)
{
if
(
target_
.
arch
==
Target
::
Arch
::
NVGPU
)
{
CompileCudaModule
(
module
);
}
else
if
(
target_
.
arch
==
Target
::
Arch
::
X86
)
{
CompileX86Module
(
module
);
}
else
{
CINN_NOT_IMPLEMENTED
}
}
void
Compiler
::
CompileCudaModule
(
const
Module
&
module
,
const
std
::
string
&
code
)
{
#ifdef CINN_WITH_CUDA
auto
_host_module_device_module_
=
SplitCudaAndHostModule
(
module
);
// NOLINT
auto
&
host_module
=
std
::
get
<
0
>
(
_host_module_device_module_
);
auto
&
device_module
=
std
::
get
<
1
>
(
_host_module_device_module_
);
VLOG
(
3
)
<<
"[CUDA] host module:
\n
"
<<
host_module
;
VLOG
(
3
)
<<
"[CUDA] device module:
\n
"
<<
device_module
;
std
::
string
source_code
;
if
(
code
.
empty
())
{
CodeGenCUDA_Dev
codegen
(
target_
);
source_code
=
codegen
.
Compile
(
device_module
);
}
else
{
source_code
=
code
;
}
CHECK
(
!
source_code
.
empty
())
<<
"Compile CUDA C code failed from device module:
\n
"
<<
device_module
;
VLOG
(
3
)
<<
"[CUDA] C:
\n
"
<<
source_code
;
SourceCodePrint
::
GetInstance
()
->
write
(
source_code
);
using
runtime
::
cuda
::
CUDAModule
;
nvrtc
::
Compiler
compiler
;
auto
ptx
=
compiler
(
source_code
);
CHECK
(
!
ptx
.
empty
())
<<
"Compile PTX failed from source code:
\n
"
<<
source_code
;
cuda_module_
.
reset
(
new
CUDAModule
(
ptx
,
compiler
.
compile_to_cubin
()
?
CUDAModule
::
Kind
::
CUBIN
:
CUDAModule
::
Kind
::
PTX
));
RuntimeSymbols
symbols
;
for
(
auto
&
fn
:
device_module
.
functions
())
{
std
::
string
kernel_fn_name
=
fn
->
name
;
auto
fn_kernel
=
cuda_module_
->
GetFunction
(
0
,
kernel_fn_name
);
CHECK
(
fn_kernel
);
symbols
.
RegisterVar
(
kernel_fn_name
+
"_ptr_"
,
reinterpret_cast
<
void
*>
(
fn_kernel
));
}
engine_
=
ExecutionEngine
::
Create
(
ExecutionOptions
(),
std
::
move
(
symbols
));
engine_
->
Link
<
CodeGenCUDA_Host
>
(
host_module
);
#else
CINN_NOT_IMPLEMENTED
#endif
}
void
Compiler
::
CompileX86Module
(
const
Module
&
module
)
{
engine_
->
Link
<
CodeGenX86
>
(
module
);
}
void
Compiler
::
ExportObject
(
const
std
::
string
&
path
)
{
engine_
->
ExportObject
(
path
);
}
void
*
Compiler
::
Lookup
(
absl
::
string_view
fn_name
)
{
CHECK
(
engine_
);
if
(
engine_
->
Lookup
(
fn_name
)
!=
nullptr
)
{
return
engine_
->
Lookup
(
fn_name
);
}
return
nullptr
;
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/compiler.h
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/strings/string_view.h>
#include <fstream>
#include <memory>
#include <mutex>
#include <string>
#include "paddle/cinn/backends/llvm/codegen_llvm.h"
#include "paddle/cinn/backends/llvm/execution_engine.h"
#include "paddle/cinn/backends/llvm/simple_jit.h"
#include "paddle/cinn/hlir/framework/parallel_compiler.h"
#include "paddle/cinn/lang/packed_func.h"
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#endif
namespace
cinn
{
namespace
backends
{
/**
* A class for dumping the code after compilation.
* Use FLAGS_cinn_dump_group_lowered_func to specify the directory to dump
* lowered function. Use FLAGS_cinn_dump_group_source_code to specify the
* directory to dump the source code. Use FLAGS_cinn_dump_group_ptx to specify
* the directory to dump ptx. Use FLAGS_cinn_dump_group_instruction to specify
* the directory to dump instruction.
*/
class
CompilationInfoDumper
{
public:
explicit
CompilationInfoDumper
(
const
hlir
::
framework
::
ParallelCompiler
::
CompilationResult
&
info
)
:
info_
(
info
)
{
DumpLoweredFunc
();
DumpSourceCode
();
DumpPtxCode
();
DumpInstruction
();
}
private:
void
DumpLoweredFunc
();
void
DumpSourceCode
();
void
DumpPtxCode
();
void
DumpInstruction
();
void
Dump
(
const
std
::
string
&
base_path
,
const
int
idx
,
const
std
::
string
&
file_name
,
const
std
::
string
&
content
);
const
hlir
::
framework
::
ParallelCompiler
::
CompilationResult
&
info_
;
};
class
SourceCodePrint
{
public:
static
SourceCodePrint
*
GetInstance
()
{
static
SourceCodePrint
print
;
return
&
print
;
}
void
write
(
const
std
::
string
&
source_code
);
private:
SourceCodePrint
();
~
SourceCodePrint
();
std
::
ofstream
of
;
std
::
mutex
mtx_
;
};
class
Compiler
final
{
public:
static
std
::
unique_ptr
<
Compiler
>
Create
(
const
Target
&
target
)
{
return
std
::
unique_ptr
<
Compiler
>
(
new
Compiler
(
target
));
}
/**
* Compile and link to a CINN module.
*/
void
Build
(
const
ir
::
Module
&
module
,
const
std
::
string
&
code
=
""
);
void
ExportObject
(
const
std
::
string
&
path
);
std
::
string
GetSourceCode
(
const
ir
::
Module
&
module
);
void
BuildDefault
(
const
ir
::
Module
&
module
);
/**
* Retrieve a function by \p fn_name.
* @return function address or null if not exists.
*/
void
*
Lookup
(
absl
::
string_view
fn_name
);
private:
void
CompileCudaModule
(
const
ir
::
Module
&
module
,
const
std
::
string
&
code
=
""
);
void
CompileX86Module
(
const
ir
::
Module
&
module
);
explicit
Compiler
(
const
Target
&
target
)
:
target_
(
target
),
engine_
(
ExecutionEngine
::
Create
(
ExecutionOptions
()))
{}
CINN_DISALLOW_COPY_AND_ASSIGN
(
Compiler
);
private:
Target
target_
;
std
::
unique_ptr
<
ExecutionEngine
>
engine_
;
#ifdef CINN_WITH_CUDA
std
::
unique_ptr
<
runtime
::
cuda
::
CUDAModule
>
cuda_module_
;
#endif
};
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/compiler_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/compiler.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/common/test_helper.h"
#include "paddle/cinn/hlir/pe/elementwise.h"
#include "paddle/cinn/hlir/pe/nn.h"
#include "paddle/cinn/runtime/use_extern_funcs.h"
#include "paddle/cinn/utils/timer.h"
namespace
cinn
{
namespace
backends
{
TEST
(
Compiler
,
x86
)
{
Expr
M
(
1024
),
N
(
1024
);
auto
create_module
=
[
&
]()
{
Placeholder
<
float
>
A
(
"A"
,
{
M
,
N
});
Placeholder
<
float
>
B
(
"B"
,
{
M
,
N
});
auto
C
=
Compute
(
{
M
,
N
},
[
=
](
Expr
i
,
Expr
j
)
{
return
A
(
i
,
j
)
+
B
(
i
,
j
);
},
"C"
);
return
std
::
make_tuple
(
A
,
B
,
C
);
};
{
// test x86
auto
_A_B_C_
=
create_module
();
// NOLINT
auto
&
A
=
std
::
get
<
0
>
(
_A_B_C_
);
auto
&
B
=
std
::
get
<
1
>
(
_A_B_C_
);
auto
&
C
=
std
::
get
<
2
>
(
_A_B_C_
);
auto
stages
=
CreateStages
({
C
});
auto
fn
=
Lower
(
"fn"
,
stages
,
{
A
,
B
,
C
});
ir
::
Module
::
Builder
builder
(
"some_module"
,
common
::
DefaultHostTarget
());
builder
.
AddFunction
(
fn
);
auto
compiler
=
Compiler
::
Create
(
common
::
DefaultHostTarget
());
compiler
->
Build
(
builder
.
Build
());
auto
*
fnp
=
compiler
->
Lookup
(
"fn"
);
ASSERT_TRUE
(
fnp
);
auto
*
Ab
=
common
::
BufferBuilder
(
Float
(
32
),
{
M
.
as_int32
(),
N
.
as_int32
()})
.
set_random
()
.
Build
();
auto
*
Bb
=
common
::
BufferBuilder
(
Float
(
32
),
{
M
.
as_int32
(),
N
.
as_int32
()})
.
set_random
()
.
Build
();
auto
*
Cb
=
common
::
BufferBuilder
(
Float
(
32
),
{
M
.
as_int32
(),
N
.
as_int32
()})
.
set_zero
()
.
Build
();
auto
args
=
common
::
ArgsBuilder
().
Add
(
Ab
).
Add
(
Bb
).
Add
(
Cb
).
Build
();
reinterpret_cast
<
void
(
*
)(
void
*
,
int
)
>
(
fnp
)(
args
.
data
(),
args
.
size
());
// test result
auto
*
Ad
=
reinterpret_cast
<
float
*>
(
Ab
->
memory
);
auto
*
Bd
=
reinterpret_cast
<
float
*>
(
Bb
->
memory
);
auto
*
Cd
=
reinterpret_cast
<
float
*>
(
Cb
->
memory
);
for
(
int
i
=
0
;
i
<
Ab
->
num_elements
();
i
++
)
{
ASSERT_NEAR
(
Ad
[
i
]
+
Bd
[
i
],
Cd
[
i
],
1e-5
);
}
}
}
#ifdef CINN_WITH_CUDA
TEST
(
Compiler
,
cuda
)
{
Expr
M
(
1024
),
N
(
1024
);
auto
create_module
=
[
&
]()
{
Placeholder
<
float
>
A
(
"A"
,
{
M
,
N
});
Placeholder
<
float
>
B
(
"B"
,
{
M
,
N
});
auto
C
=
Compute
(
{
M
,
N
},
[
=
](
Expr
i
,
Expr
j
)
{
return
A
(
i
,
j
)
+
B
(
i
,
j
);
},
"C"
);
return
std
::
make_tuple
(
A
,
B
,
C
);
};
{
// cuda
auto
_A_B_C_
=
create_module
();
// NOLINT
auto
&
A
=
std
::
get
<
0
>
(
_A_B_C_
);
auto
&
B
=
std
::
get
<
1
>
(
_A_B_C_
);
auto
&
C
=
std
::
get
<
2
>
(
_A_B_C_
);
auto
stages
=
CreateStages
({
C
});
stages
[
C
]
->
Bind
(
0
,
"blockIdx.x"
);
stages
[
C
]
->
Bind
(
1
,
"threadIdx.x"
);
auto
fn
=
Lower
(
"fn"
,
stages
,
{
A
,
B
,
C
});
ir
::
Module
::
Builder
builder
(
"some_module"
,
common
::
DefaultHostTarget
());
builder
.
AddFunction
(
fn
);
auto
compiler
=
Compiler
::
Create
(
common
::
DefaultNVGPUTarget
());
compiler
->
Build
(
builder
.
Build
());
auto
*
fnp
=
compiler
->
Lookup
(
"fn"
);
ASSERT_TRUE
(
fnp
);
auto
*
Ab
=
common
::
BufferBuilder
(
Float
(
32
),
{
M
.
as_int32
(),
N
.
as_int32
()})
.
set_random
()
.
Build
();
auto
*
Bb
=
common
::
BufferBuilder
(
Float
(
32
),
{
M
.
as_int32
(),
N
.
as_int32
()})
.
set_random
()
.
Build
();
auto
*
Cb
=
common
::
BufferBuilder
(
Float
(
32
),
{
M
.
as_int32
(),
N
.
as_int32
()})
.
set_zero
()
.
Build
();
// allocate CUDA buffer
void
*
Ag
,
*
Bg
,
*
Cg
;
const
int
num_bytes
=
Ab
->
num_elements
()
*
sizeof
(
float
);
cudaMalloc
(
&
Ag
,
num_bytes
);
cudaMalloc
(
&
Bg
,
num_bytes
);
cudaMalloc
(
&
Cg
,
num_bytes
);
CUDA_CALL
(
cudaMemcpy
(
Ag
,
Ab
->
memory
,
num_bytes
,
cudaMemcpyHostToDevice
));
CUDA_CALL
(
cudaMemcpy
(
Bg
,
Bb
->
memory
,
num_bytes
,
cudaMemcpyHostToDevice
));
CUDA_CALL
(
cudaMemcpy
(
Cg
,
Cb
->
memory
,
num_bytes
,
cudaMemcpyHostToDevice
));
cinn_buffer_t
Abb
;
Abb
.
memory
=
reinterpret_cast
<
uint8_t
*>
(
Ag
);
cinn_buffer_t
Bbb
;
Bbb
.
memory
=
reinterpret_cast
<
uint8_t
*>
(
Bg
);
cinn_buffer_t
Cbb
;
Cbb
.
memory
=
reinterpret_cast
<
uint8_t
*>
(
Cg
);
auto
args
=
common
::
ArgsBuilder
().
Add
(
&
Abb
).
Add
(
&
Bbb
).
Add
(
&
Cbb
).
Build
();
utils
::
Timer
timer
;
timer
.
Start
();
void
*
stream
=
nullptr
;
for
(
int
i
=
0
;
i
<
1000
;
i
++
)
{
reinterpret_cast
<
void
(
*
)(
void
*
,
int
,
void
*
)
>
(
fnp
)(
args
.
data
(),
args
.
size
(),
stream
);
}
CUDA_CALL
(
cudaDeviceSynchronize
());
float
latency
=
timer
.
Stop
();
LOG
(
INFO
)
<<
"latency: "
<<
latency
/
1000
;
std
::
vector
<
float
>
ch
(
M
.
as_int32
()
*
N
.
as_int32
(),
0.
f
);
CUDA_CALL
(
cudaMemcpy
(
ch
.
data
(),
Cg
,
ch
.
size
()
*
sizeof
(
float
),
cudaMemcpyDeviceToHost
));
auto
*
Ad
=
reinterpret_cast
<
float
*>
(
Ab
->
memory
);
auto
*
Bd
=
reinterpret_cast
<
float
*>
(
Bb
->
memory
);
for
(
int
i
=
0
;
i
<
Ab
->
num_elements
();
i
++
)
{
ASSERT_NEAR
(
Ad
[
i
]
+
Bd
[
i
],
ch
[
i
],
1e-5
);
}
}
}
#endif
TEST
(
Compiler
,
sqrt
)
{
Expr
N
(
100
);
Expr
C
(
10
);
Expr
H
(
10
);
Expr
W
(
10
);
Placeholder
<
float
>
input
(
"input"
,
{
N
,
C
,
H
,
W
});
Placeholder
<
float
>
mean
(
"mean"
,
{
C
});
Placeholder
<
float
>
scale
(
"scale"
,
{
C
});
Placeholder
<
float
>
variance
(
"variance"
,
{
C
});
Placeholder
<
float
>
bias
(
"bias"
,
{
C
});
float
epsilon
=
0.1
f
;
auto
A
=
Compute
(
{
N
,
C
,
H
,
W
},
[
=
](
Expr
n
,
Expr
c
,
Expr
h
,
Expr
w
)
{
return
(
input
(
n
,
c
,
h
,
w
)
-
mean
(
c
))
*
scale
(
c
)
/
lang
::
Sqrt
(
variance
(
c
)
+
Expr
(
epsilon
))
+
bias
(
c
);
},
"A"
);
auto
B
=
hlir
::
pe
::
Pool2d
(
input
,
{
3
,
3
},
{
1
,
1
},
{
1
,
1
,
1
,
1
},
"max"
,
false
,
false
);
auto
BB
=
hlir
::
pe
::
BatchNorm_NCHW
(
input
,
scale
,
bias
,
mean
,
variance
,
epsilon
,
"batchnorm"
);
auto
stages
=
CreateStages
({
input
,
mean
,
scale
,
variance
,
A
,
bias
,
B
[
0
],
BB
});
auto
fn
=
Lower
(
"fn"
,
stages
,
{
input
,
mean
,
scale
,
bias
,
variance
,
A
,
B
[
0
],
BB
});
Module
::
Builder
builder
(
"some"
,
common
::
DefaultHostTarget
());
builder
.
AddFunction
(
fn
);
auto
compiler
=
Compiler
::
Create
(
common
::
DefaultHostTarget
());
compiler
->
Build
(
builder
.
Build
());
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/cuda_util.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/cuda_util.h"
#include <glog/logging.h>
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/common/target.h"
namespace
cinn
{
namespace
backends
{
std
::
string
cuda_thread_axis_name
(
int
level
)
{
switch
(
level
)
{
case
0
:
return
"threadIdx.x"
;
break
;
case
1
:
return
"threadIdx.y"
;
break
;
case
2
:
return
"threadIdx.z"
;
break
;
}
return
""
;
}
std
::
string
cuda_block_axis_name
(
int
level
)
{
switch
(
level
)
{
case
0
:
return
"blockIdx.x"
;
break
;
case
1
:
return
"blockIdx.y"
;
break
;
case
2
:
return
"blockIdx.z"
;
break
;
}
return
""
;
}
}
// namespace backends
}
// namespace cinn
paddle/cinn/backends/cuda_util.h
0 → 100644
View file @
992bec46
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef CINN_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <curand.h>
#include <glog/logging.h>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/cinn/runtime/cinn_runtime.h"
#define CUDA_DRIVER_CALL(func) \
{ \
auto status = func; \
if (status != CUDA_SUCCESS) { \
const char* msg; \
cuGetErrorString(status, &msg); \
LOG(FATAL) << "CUDA Driver Error: " #func " failed with error: " << msg; \
} \
}
#define CUDA_CALL(func) \
{ \
auto status = func; \
if (status != cudaSuccess) { \
LOG(FATAL) << "CUDA Error : " << cudaGetErrorString(status); \
} \
}
#define CURAND_CALL(func) \
{ \
auto status = func; \
if (status != CURAND_STATUS_SUCCESS) { \
LOG(FATAL) << "CURAND Error : " << status; \
} \
}
#define CUSOLVER_CALL(func) \
{ \
auto status = func; \
if (status != CUSOLVER_STATUS_SUCCESS) { \
LOG(FATAL) << "CUSOLVER Error: " << status; \
} \
}
#define CUBLAS_CALL(func) \
{ \
auto status = func; \
if (status != CUBLAS_STATUS_SUCCESS) { \
LOG(FATAL) << "CUBLAS Error!"; \
} \
}
#define CUDNN_CALL(func) \
{ \
auto status = func; \
if (status != CUDNN_STATUS_SUCCESS) { \
LOG(FATAL) << "CUDNN Error : " << cudnnGetErrorString(status); \
} \
}
#define NVRTC_CALL(func) \
{ \
auto status = func; \
if (status != NVRTC_SUCCESS) { \
LOG(FATAL) << "NVRTC Error : " << nvrtcGetErrorString(status); \
} \
}
namespace
cinn
{
namespace
backends
{
// CUDA syntax for thread axis.
std
::
string
cuda_thread_axis_name
(
int
level
);
// CUDA syntax for block axis.
std
::
string
cuda_block_axis_name
(
int
level
);
}
// namespace backends
}
// namespace cinn
#endif // CINN_WITH_CUDA
Prev
1
…
8
9
10
11
12
13
14
15
16
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment