Commit efa4f0f4 (unverified), authored Sep 29, 2025 by spike-zhu, committed by GitHub on Sep 29, 2025

issue/453: add AWQ-dequantize in iluvatar gpu

Parent: fbfb0ef6
Showing 5 changed files with 209 additions and 0 deletions (+209 −0):

src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu    +127 −0
src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cuh     +8 −0
src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_kernel.cuh      +56 −0
src/infiniop/ops/dequantize_awq/operator.cc                                +15 −0
xmake/iluvatar.lua                                                          +3 −0
src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu (new file, mode 0 → 100644)
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "dequantize_w42f16_iluvatar.cuh"
#include "dequantize_w42f16_kernel.cuh"
#include "../dequantize_awq.h"
#include <cuda_fp16.h>

__global__ void __launch_bounds__(64) dequantize_weights(
    int *__restrict__ B,
    half *__restrict__ scaling_factors,
    int *__restrict__ zeros,
    half *__restrict__ C,
    int G) {
    // static constexpr uint32_t ZERO = 0x0;
    half B_shared[32 * (128 + 8)];
    half *B_shared_ptr2 = B_shared;

    int N = blockDim.x * gridDim.x; // 2
    int col = (blockIdx.x * blockDim.x + threadIdx.x);
    int row = (blockIdx.y * blockDim.y + threadIdx.y);
    int index1 = 8 * col + 8 * row * N;
    half *C_ptr2 = C + index1;

    int index2 = col + row * N;
    int *B_ptr2 = B + index2;

    int index3 = col + (int)(row / G) * N;
    int *zeros_ptr2 = zeros + index3;
    int index4 = 8 * col + (int)(row / G) * N * 8;
    half *scaling_factors_ptr2 = scaling_factors + index4;

    uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
    uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);

    uint32_t B_loaded = *(uint32_t *)B_ptr2;
    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);

    // Reinterpret uint4 components as __half2
    __half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
    __half2 *B_loaded_zero_h2 = reinterpret_cast<__half2 *>(&B_loaded_zero);
    __half2 *B_loaded_scale_h2 = reinterpret_cast<__half2 *>(&B_loaded_scale);

    // Replace PTX sub.f16x2 with __hsub2 for each component
    B_loaded_fp16_h2[0] = __hsub2(B_loaded_fp16_h2[0], B_loaded_zero_h2[0]);
    B_loaded_fp16_h2[1] = __hsub2(B_loaded_fp16_h2[1], B_loaded_zero_h2[1]);
    B_loaded_fp16_h2[2] = __hsub2(B_loaded_fp16_h2[2], B_loaded_zero_h2[2]);
    B_loaded_fp16_h2[3] = __hsub2(B_loaded_fp16_h2[3], B_loaded_zero_h2[3]);

    // Replace PTX fma.rn.f16x2 with __hfma2 for each component
    B_loaded_fp16_h2[0] = __hfma2(B_loaded_fp16_h2[0], B_loaded_scale_h2[0], __float2half2_rn(0.0f));
    B_loaded_fp16_h2[1] = __hfma2(B_loaded_fp16_h2[1], B_loaded_scale_h2[1], __float2half2_rn(0.0f));
    B_loaded_fp16_h2[2] = __hfma2(B_loaded_fp16_h2[2], B_loaded_scale_h2[2], __float2half2_rn(0.0f));
    B_loaded_fp16_h2[3] = __hfma2(B_loaded_fp16_h2[3], B_loaded_scale_h2[3], __float2half2_rn(0.0f));

    // Store back to shared memory
    *(uint4 *)B_shared_ptr2 = B_loaded_fp16;

    for (int i = 0; i < 8; ++i) {
        *(C_ptr2 + i) = B_shared[i];
    }
}
namespace op::dequantize_awq::iluvatar {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t qweight_desc,
    infiniopTensorDescriptor_t scales_desc,
    infiniopTensorDescriptor_t zeros_desc) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);

    auto result = DequantizeAWQInfo::create(out_desc, qweight_desc, scales_desc, zeros_desc);

    *desc_ptr = new Descriptor(
        0,
        new Opaque{handle->internal()},
        result.take(),
        handle->device,
        handle->device_id);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *qweight,
    const void *scales,
    const void *zeros,
    void *stream) const {
    int in_features = _info.in_features();
    int out_features = _info.out_features();
    int group_size = in_features / _info.num_groups();

    // ==================== Default configuration, fixed at 8 ====================
    constexpr int BLOCK_X = 8;
    constexpr int BLOCK_Y = 8;
    int x_blocks = (out_features + BLOCK_X - 1) / BLOCK_X;
    int y_blocks = (in_features + BLOCK_Y - 1) / BLOCK_Y;
    dim3 num_blocks(x_blocks, y_blocks);
    dim3 threads_per_block(BLOCK_X, BLOCK_Y);
    // =====================================================

    half *out_ = reinterpret_cast<half *>(out);
    int *qweight_ = const_cast<int *>(reinterpret_cast<const int *>(qweight));
    half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
    int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));

    dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
        qweight_, scales_, zeros_, out_, group_size);

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::dequantize_awq::iluvatar
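To make the launch geometry in Descriptor::calculate concrete, here is a minimal host-side sketch (not part of this commit) that mirrors its arithmetic; the sizes in_features = 4096, out_features = 512, and num_groups = 32 are hypothetical values chosen only for illustration, whereas the operator obtains them from DequantizeAWQInfo.

#include <cstdio>

int main() {
    // Hypothetical sizes, for illustration only.
    int in_features = 4096;
    int out_features = 512;
    int num_groups = 32;
    int group_size = in_features / num_groups; // 128

    // Same arithmetic as Descriptor::calculate.
    constexpr int BLOCK_X = 8;
    constexpr int BLOCK_Y = 8;
    int x_blocks = (out_features + BLOCK_X - 1) / BLOCK_X; // 64
    int y_blocks = (in_features + BLOCK_Y - 1) / BLOCK_Y;  // 512

    // Each thread dequantizes one packed 32-bit word (8 nibbles) and writes
    // 8 consecutive half values starting at 8 * col + 8 * row * N,
    // where N = blockDim.x * gridDim.x.
    printf("grid=(%d,%d), block=(%d,%d), group_size=%d\n",
           x_blocks, y_blocks, BLOCK_X, BLOCK_Y, group_size);
    return 0;
}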
src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cuh (new file, mode 0 → 100644)
#ifndef __DEQUANTIZE_AWQ_ILUVATAR_CUH__
#define __DEQUANTIZE_AWQ_ILUVATAR_CUH__

#include "../dequantize_awq.h"

DESCRIPTOR(iluvatar)

#endif // __DEQUANTIZE_AWQ_ILUVATAR_CUH__
src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_kernel.cuh (new file, mode 0 → 100644)
#pragma once

#include <cuda_fp16.h> // Required for the __half and __half2 types

/**
 * @brief Dequantize a uint32_t packing eight 4-bit integers into eight half-precision floats.
 *
 * This is a generic CUDA C++ implementation that replaces the original PTX assembly version,
 * so that it can run on GPUs that do not support advanced PTX instructions (such as lop3.b32).
 * The output order matches the PTX interleaved packing: v0, v4, v1, v5, v2, v6, v3, v7
 * (after the signed adjustment).
 *
 * @param source The 32-bit unsigned integer packing eight 4-bit values.
 * @return A uint4 containing the eight dequantized half values.
 */
__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
    // Step 1: Unpack eight 4-bit unsigned integers from one 32-bit source word.
    // The source layout is assumed to be [v7, v6, v5, v4, v3, v2, v1, v0],
    // where each 'v' is a 4-bit nibble.
    const unsigned int v0 = (source >> 0) & 0x0F;
    const unsigned int v1 = (source >> 4) & 0x0F;
    const unsigned int v2 = (source >> 8) & 0x0F;
    const unsigned int v3 = (source >> 12) & 0x0F;
    const unsigned int v4 = (source >> 16) & 0x0F;
    const unsigned int v5 = (source >> 20) & 0x0F;
    const unsigned int v6 = (source >> 24) & 0x0F;
    const unsigned int v7 = (source >> 28) & 0x0F;

    // Step 2: For signed 4-bit (s4), subtract 8 to map the values into the [-8, 7] range.
    // Define the offset
    __half offset = __half(8);

    // Compute the signed values
    __half hv0 = __half(v0) - offset;
    __half hv1 = __half(v1) - offset;
    __half hv2 = __half(v2) - offset;
    __half hv3 = __half(v3) - offset;
    __half hv4 = __half(v4) - offset;
    __half hv5 = __half(v5) - offset;
    __half hv6 = __half(v6) - offset;
    __half hv7 = __half(v7) - offset;

    // Step 3: Pack the half values into __half2 pairs in the PTX interleaved order and store them in result.
    // Order: result_ptr[0]: low=hv0, high=hv4
    //        result_ptr[1]: low=hv1, high=hv5
    //        result_ptr[2]: low=hv2, high=hv6
    //        result_ptr[3]: low=hv3, high=hv7
    // __halves2half2: low is the first argument, high is the second.
    uint4 result;
    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
    result_ptr[0] = __halves2half2(hv0, hv4);
    result_ptr[1] = __halves2half2(hv1, hv5);
    result_ptr[2] = __halves2half2(hv2, hv6);
    result_ptr[3] = __halves2half2(hv3, hv7);
    return result;
}
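Because the interleaved output order is the easiest part to get wrong when porting away from the PTX version, a plain-C++ host reference that reproduces the same ordering can be useful for spot-checking the device function against known inputs. This is only a sketch, not part of the commit, and the helper name is hypothetical.

#include <cstdint>

// Mirrors dequantize_s4_to_fp16x2 in float on the host: unpack nibbles v0..v7
// (v0 is the lowest nibble), subtract the signed offset 8, then emit them in the
// interleaved order v0, v4, v1, v5, v2, v6, v3, v7, which is the order in which
// the half values sit in the returned uint4.
void reference_s4_dequant_order(uint32_t source, float out[8]) {
    float v[8];
    for (int i = 0; i < 8; ++i) {
        v[i] = float((source >> (4 * i)) & 0x0F) - 8.0f;
    }
    for (int i = 0; i < 4; ++i) {
        out[2 * i] = v[i];         // low half of pair i
        out[2 * i + 1] = v[i + 4]; // high half of pair i
    }
}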
src/infiniop/ops/dequantize_awq/operator.cc
...
@@ -8,6 +8,9 @@
#ifdef ENABLE_MOORE_API
#include "moore/dequantize_w42f16_moore.h"
#endif
#ifdef ENABLE_ILUVATAR_API
#include "iluvatar/dequantize_w42f16_iluvatar.cuh"
#endif

__C infiniStatus_t infiniopCreateDequantizeAWQDescriptor(
    infiniopHandle_t handle,
...
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateDequantizeAWQDescriptor(
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDe
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDequantizeAWQ(
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
@@ -104,6 +116,9 @@ infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) {
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, iluvatar);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
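Each hunk above only adds an iluvatar case to an existing switch over the device type, guarded by ENABLE_ILUVATAR_API; CREATE, GET, CALCULATE, and DELETE are the per-backend dispatch macros already used by the other backends. The following self-contained sketch illustrates that guard-plus-switch pattern with invented names (Device, DISPATCH, and dispatch are made up for this example and are not infiniop's actual macros or types):

#include <cstdio>

#define ENABLE_ILUVATAR_API // in the real build this comes from xmake

enum Device { DEVICE_NVIDIA, DEVICE_ILUVATAR };

// Hypothetical per-backend hook, standing in for CREATE/GET/CALCULATE/DELETE.
#define DISPATCH(CASE, NAMESPACE)                  \
    case CASE:                                     \
        printf("dispatching to %s\n", #NAMESPACE); \
        return 0;

int dispatch(Device device) {
    switch (device) {
#ifdef ENABLE_ILUVATAR_API
        DISPATCH(DEVICE_ILUVATAR, iluvatar)
#endif
    default:
        return -1; // device type not supported
    }
}

int main() { return dispatch(DEVICE_ILUVATAR) == 0 ? 0 : 1; }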
xmake/iluvatar.lua
...
@@ -50,6 +50,9 @@ target("infiniop-iluvatar")
    -- set_languages("cxx17") -- this setting does not seem to work on the Iluvatar platform
    add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
    -- The Iluvatar platform does not support some NVIDIA PTX instructions, so the AWQ
    -- dequantization uses a CUDA C++ implementation instead
    add_files("../src/infiniop/ops/dequantize_awq/iluvatar/*.cu")
    if has_config("ninetoothed") then
        add_files("../build/ninetoothed/*.c", {cxflags = {"-Wno-return-type"}})
    end
...