OpenDAS / Lmdeploy · Commit 09a80a64
Authored May 28, 2024 by gaoqiong

Add the AWQ-related .so and update where the .so is stored; remove the awq_sugon folder and merge its functions into kernels/gemm_s_f16.

Parent: 617e86ea
Showing 11 changed files with 83 additions and 244 deletions (+83 −244).
setup.py                                                      +15 −13
src/turbomind/kernels/gemm_s_f16/CMakeLists.txt               +1 −1
src/turbomind/kernels/gemm_s_f16/format.cu                    +37 −1
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h                +18 −0
src/turbomind/models/llama/CMakeLists.txt                     +2 −2
src/turbomind/models/llama/LlamaLinear.h                      +0 −1
src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cu     +0 −143 (deleted)
src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh    +0 −36 (deleted)
src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh       +0 −25 (deleted)
src/turbomind/utils/cublasMMWrapper.cc                        +8 −20
src/turbomind/utils/cublasMMWrapper.h                         +2 −2
setup.py
@@ -69,20 +69,7 @@ def get_version_add(sha: Optional[str] = None) -> str:
         file.writelines(lines)
         file.close()
 
-def copy_ck_so():
-    lmdeploy_root = os.path.dirname(os.path.abspath(__file__))
-    so_path = os.path.join(os.path.join(lmdeploy_root, "3rdparty"), "libgemm_multiB_int4.so")
-    # dtk version
-    if os.getenv("ROCM_PATH"):
-        rocm_path = os.getenv('ROCM_PATH', "")
-        rocm_so_path = os.path.join(rocm_path, 'lib')
-        print("rocm_so_path:", rocm_so_path)
-        shutil.copy(so_path, rocm_so_path)
-    else:
-        shutil.copy(so_path, "usr/local/lib")
-
 def get_version():
-    copy_ck_so()
     get_version_add()
     version_file = 'lmdeploy/version.py'
     with open(version_file, encoding='utf-8') as f:
@@ -197,9 +184,24 @@ def parse_requirements(fname='requirements.txt', with_version=True):
         packages += cuda_pkgs
     return packages
 
+def copy_ck_so():
+    lmdeploy_root = os.path.dirname(os.path.abspath(__file__))
+    so_path = os.path.join(os.path.join(lmdeploy_root, "3rdparty", "composable_kernel"), "libgemm_multiB_int4.so")
+    # dtk version
+    target_path = os.path.join(lmdeploy_root, "lmdeploy", "lib")
+    if os.path.exists(target_path):
+        shutil.copy(so_path, target_path)
+    elif os.getenv("ROCM_PATH"):
+        rocm_path = os.getenv('ROCM_PATH', "")
+        rocm_so_path = os.path.join(rocm_path, 'lib')
+        print("rocm_so_path:", rocm_so_path)
+        shutil.copy(so_path, rocm_so_path)
+    else:
+        shutil.copy(so_path, "usr/local/lib")
 
 if __name__ == '__main__':
     lmdeploy_package_data = ['lmdeploy/bin/llama_gemm']
+    copy_ck_so()
     setup(
         name='lmdeploy',
         version=get_version(),
src/turbomind/kernels/gemm_s_f16/CMakeLists.txt
@@ -2,7 +2,7 @@
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
-add_library(gemm_s4_f16 STATIC gemm_s4_f16.cu format.cu ../../models/llama/awq_sugon/gemm_w4_dequation.cu)
+add_library(gemm_s4_f16 STATIC gemm_s4_f16.cu format.cu)
 target_compile_options(gemm_s4_f16 PRIVATE
     --generate-line-info -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr)
 set_property(TARGET gemm_s4_f16 PROPERTY POSITION_INDEPENDENT_CODE ON)
src/turbomind/kernels/gemm_s_f16/format.cu
 // Copyright (c) OpenMMLab. All rights reserved.
 #include "common.h"
-#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
 #include <iostream>
+#define BLOCKSIZE 256
 
 namespace turbomind {
...
@@ -245,4 +245,40 @@ void addFusedSiluActivation(cudaStream_t stream, half* output, const half* src, int m, int n, int type)
         return;
     }
 }
 
+template<typename T>
+__global__ void input_padding_kernel(int num_kernels, T* output, const T* input, int m, int k, int group_size, int count)
+{
+    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (id >= num_kernels)
+        return;
+    int j = id % (k + count * group_size);
+    int i = id / (k + count * group_size);
+    if (j < k) {
+        output[i * (k + count * group_size) + j] = input[i * k + j];
+    }
+    else {
+        output[i * (k + count * group_size) + j] = 0.f;
+    }
+}
+
+template<typename T>
+void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount)
+{
+    // input has shape [m, k]; output has shape [m, k + pad_groupcount * group_size]
+    int num_kernels = m * (k + pad_groupcount * group_size);
+    input_padding_kernel<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, stream>>>(
+        num_kernels, output, input, m, k, group_size, pad_groupcount);
+}
+
+#define INSTANTIATEINPUTPADING(T)                                                                                      \
+    template void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);
+
+INSTANTIATEINPUTPADING(__half)
+
 }  // namespace turbomind
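The added input_padding helper widens an [m, k] activation matrix to [m, k + pad_groupcount * group_size] and zero-fills the new columns; judging from the header change below, this prepares tensor A for the prebuilt composable_kernel weight-only GEMM, which expects a padded K. A minimal usage sketch, assuming illustrative shapes and buffer names that are not taken from the commit:

    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    #include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"  // declares turbomind::input_padding

    // Illustrative only: m, k, group_size and the buffer names are made up for the sketch.
    void pad_activations_example(cudaStream_t stream)
    {
        const int m = 4, k = 11008, group_size = 128, pad_groupcount = 1;
        const int k_padded = k + pad_groupcount * group_size;

        half* d_in  = nullptr;  // [m, k] activations, assumed already filled on the device
        half* d_out = nullptr;  // [m, k_padded] padded activations
        cudaMalloc((void**)&d_in, sizeof(half) * m * k);
        cudaMalloc((void**)&d_out, sizeof(half) * m * k_padded);

        // Copies every input element into the same row of the wider output and
        // zero-fills the trailing pad_groupcount * group_size columns.
        turbomind::input_padding<half>(stream, d_out, d_in, m, k, group_size, pad_groupcount);
        cudaStreamSynchronize(stream);

        cudaFree(d_in);
        cudaFree(d_out);
    }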
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h
@@ -9,6 +9,22 @@
 #include <memory>
 #include <vector>
 
+typedef struct ihipStream_t* hipStream_t;
+
+extern void run_weight_only_gemm(const void* A,
+                                 const void* B0,
+                                 const void* B1,
+                                 void*       C,
+                                 int         M,
+                                 int         N,
+                                 int         K,
+                                 int         StrideA,
+                                 int         StrideB,
+                                 int         StrideB_padded,
+                                 int         StrideC,
+                                 int         Group,
+                                 void*       splitK_padA_workspace,
+                                 int         splitK_padA_workspace_elementSize,
+                                 hipStream_t stream_id = 0);
+
 namespace turbomind {
...
@@ -17,6 +33,8 @@ void dequant_w4_gemm(cudaStream_t stream, half* output, const uint32_t* weight, const half2* zeros_and_scales, int k, int n, int group_size);
 
 void addFusedSiluActivation(cudaStream_t stream, half* output, const half* src, int m, int n, int type);
 
 void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output, const uint32_t* weight, const half2* zeros_and_scales, int k, int n, int group_size);
 
+template<typename T>
+void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);
+
 class GemmS4F16 {
 public:
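run_weight_only_gemm is the entry point exported by the prebuilt libgemm_multiB_int4.so from 3rdparty/composable_kernel; hipStream_t is forward-declared here so the header can be used without pulling in the HIP headers. A hedged sketch of a call site follows, with the argument meanings taken from the comments in the deleted gemm_w4_dequation.cuh; the fp16/int4 buffer layouts, the stride choices and the problem sizes are assumptions, not spelled out by this commit:

    #include <cuda_fp16.h>

    #include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"  // declares run_weight_only_gemm and hipStream_t

    // Illustrative call only: M/N/K, the stride choices and the exact packing of B0/B1 are assumptions.
    void weight_only_gemm_example(const half* d_act,            // A: [M, K] fp16 activations (assumed)
                                  const void* d_packed_weight,  // B0: packed int4 weights (assumed)
                                  const void* d_zeros_scales,   // B1: per-group zeros and scales (assumed)
                                  half*       d_out,            // C: [M, N] fp16 output (assumed)
                                  void*       d_workspace,      // split-K / A-padding workspace
                                  int         workspace_bits,
                                  hipStream_t stream)
    {
        const int M = 4, N = 4096, K = 11008, group_size = 128;
        const int K_padded = K + group_size;  // K after padding, produced by input_padding above

        run_weight_only_gemm(d_act, d_packed_weight, d_zeros_scales, d_out,
                             M, N, K,
                             /*StrideA=*/K,
                             /*StrideB=*/K,
                             /*StrideB_padded=*/K_padded,  // K of the weight matrix after padding
                             /*StrideC=*/N,
                             /*Group=*/group_size,
                             d_workspace,                  // device memory for split-K and for padding tensor A
                             workspace_bits,               // workspace size in bits, per the original comment
                             stream);
    }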
src/turbomind/models/llama/CMakeLists.txt
@@ -20,12 +20,12 @@ add_library(Llama STATIC
     llama_kernels.cu
     llama_decoder_kernels.cu
     llama_utils.cu
-    ./awq_sugon/gemm_w4_dequation.cu)
+)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
 #set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
 #set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_directories(Llama PUBLIC ../../../../3rdparty/)
+target_link_directories(Llama PUBLIC ../../../../3rdparty/composable_kernel/)
 target_link_libraries(Llama PUBLIC cudart
     gemm_s4_f16
     cublasMMWrapper
src/turbomind/models/llama/LlamaLinear.h
@@ -2,7 +2,6 @@
 #pragma once
 
-#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"
 #include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
 #include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/models/llama/llama_kernels.h"
src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cu
deleted 100644 → 0

#include "src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh"
#include "src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh"

template<typename T>
__global__ void add_kernel(int n, T* A, const T* B)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= n)
        return;
    A[id] = A[id] + B[id];
}

template<typename T>
__global__ void assign_kernel(int n, T* A, const T* B)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= n)
        return;
    A[id] = B[id];
}

template<typename T>
void assign_fun(cudaStream_t stream, T* A, const T* B, int size)
{
    int num_kernels = size;
    assign_kernel<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, stream>>>(num_kernels, A, B);
}

#define INSTANTIATEASSIGN(T)                                                                                           \
    template void assign_fun(cudaStream_t stream, T* A, const T* B, int size);

INSTANTIATEASSIGN(__half)
INSTANTIATEASSIGN(float)
INSTANTIATEASSIGN(half2)
INSTANTIATEASSIGN(uint)

template<typename T>
void PrintScale(cudaStream_t stream, const T* data, int size, int flag, int m, int n)
{
    printf("start printf ****\n");
    int input_size = size;
    T*  h_data;
    h_data = new T[input_size];
    T* d_data;
    cudaMalloc((void**)&d_data, input_size * sizeof(T));
    // initialization
    // for (int i = 0; i < input_size; i++) {
    //     h_data[i] = __float2half(2.0f);
    // }
    // cudaMemcpy(d_data, h_data, input_size * sizeof(T), cudaMemcpyHostToDevice);
    assign_fun<T>(stream, d_data, data, input_size);
    cudaStreamSynchronize(stream);
    cudaMemcpy(h_data, d_data, input_size * sizeof(T), cudaMemcpyDeviceToHost);
    if (flag != 0) {
        std::string   file_name = "/FrameWork/nvidia_file/elsetest/data" + std::to_string(flag) + ".bin";
        std::ofstream outfile(file_name, std::ios::binary);
        if (!outfile) {
            std::cerr << "Failed to open the file for writing." << std::endl;
        }
        outfile.write(reinterpret_cast<const char*>(h_data), m * n * sizeof(T));
        outfile.close();
    }
    if constexpr (std::is_same_v<T, half>) {
        for (int i = 0; i < input_size; i++) {
            printf("%f ", __half2float(h_data[i]));
        }
    }
    else if constexpr (std::is_same_v<T, half2>) {
        for (int i = 0; i < input_size; i++) {
            printf("x:%f y:%f ", __half2float(h_data[i].data[0]), __half2float(h_data[i].data[1]));
        }
    }
    else if constexpr (std::is_same_v<T, uint>) {
        for (int i = 0; i < input_size; i++) {
            printf(" %u ", h_data[i]);
        }
    }
    printf("\n");
    delete[] h_data;
    cudaFree(d_data);
    return;
}

#define INSTANTIATEPRINT(T)                                                                                            \
    template void PrintScale(cudaStream_t stream, const T* data, int size, int flag, int m, int n);

INSTANTIATEPRINT(__half)
INSTANTIATEPRINT(float)
INSTANTIATEPRINT(half2)
INSTANTIATEPRINT(uint32_t)

template<typename T>
__global__ void input_padding_kernel(int num_kernels, T* output, const T* input, int m, int k, int group_size, int count)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= num_kernels)
        return;
    int j = id % (k + count * group_size);
    int i = id / (k + count * group_size);
    if (j < k) {
        output[i * (k + count * group_size) + j] = input[i * k + j];
    }
    else {
        output[i * (k + count * group_size) + j] = 0.f;
    }
}

template<typename T>
void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount)
{
    // input has shape [m, k]; output has shape [m, k + pad_groupcount * group_size]
    int num_kernels = m * (k + pad_groupcount * group_size);
    input_padding_kernel<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, stream>>>(
        num_kernels, output, input, m, k, group_size, pad_groupcount);
}

#define INSTANTIATEINPUTPADING(T)                                                                                      \
    template void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);

INSTANTIATEINPUTPADING(__half)
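PrintScale was a debugging aid only: it copies size elements from a device pointer to the host, optionally dumps m * n of them to /FrameWork/nvidia_file/elsetest/data<flag>.bin when flag is non-zero, and prints everything to stdout. For reference, the deleted helper was called roughly like the sketch below; the scale tensor and its 2 x 128 shape are made up:

    // Sketch of a call to the (now deleted) PrintScale debug helper, declared in the
    // removed gemm_w4_dequation.cuh. d_zeros_and_scales and its shape are illustrative.
    void dump_scales_example(cudaStream_t stream, const half2* d_zeros_and_scales)
    {
        const int m = 2, n = 128;
        // flag = 7 also writes m * n elements to /FrameWork/nvidia_file/elsetest/data7.bin
        PrintScale<half2>(stream, d_zeros_and_scales, m * n, /*flag=*/7, m, n);
    }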
src/turbomind/models/llama/awq_sugon/gemm_w4_dequation.cuh
deleted 100644 → 0

#pragma once

#include "src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh"
#include <string>
#include <iostream>
#include <fstream>

typedef struct ihipStream_t* hipStream_t;

// template <typename T>
// void dequant_w4_gemm(cudaStream_t stream, T* output, const uint32_t* weight, const half2* zeros_and_scales, int k, int n, int group_size);

template<typename T>
void PrintScale(cudaStream_t stream, const T* data, int size, int flag, int m, int n);

template<typename T>
void assign_fun(cudaStream_t stream, T* A, const T* B, int size);

extern void run_weight_only_gemm(const void* A,
                                 const void* B0,
                                 const void* B1,
                                 void*       C,
                                 int         M,
                                 int         N,
                                 int         K,
                                 int         StrideA,
                                 int         StrideB,
                                 int         StrideB_padded,  // K of the input weight matrix after padding
                                 int         StrideC,
                                 int         Group,
                                 void*       splitK_padA_workspace,  // device memory used for split-K and for padding tensor A
                                 int         splitK_padA_workspace_elementSize,  // size of the workspace, in bits
                                 hipStream_t stream_id = 0);

template<typename T>
void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);
src/turbomind/models/llama/awq_sugon/lmdeploy_sugon.cuh
deleted 100644 → 0

#define BLOCKSIZE 256

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cassert>
#include <cstdint>
#include <type_traits>
#include <sys/time.h>

#pragma once

struct my_timer {
    timeval ts, te;  // start time, end time
    float   dt;      // elapsed time, in milliseconds (ms)

    void start()
    {
        gettimeofday(&ts, NULL);
    }

    void stop()
    {
        gettimeofday(&te, NULL);
        long int dt_sec  = te.tv_sec - ts.tv_sec;
        long int dt_usec = te.tv_usec - ts.tv_usec;
        dt               = dt_sec * 1.0e3 + dt_usec / 1.0e3;
    }
};
\ No newline at end of file
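my_timer is a plain wall-clock helper around gettimeofday; dt comes out in milliseconds. A sketch of how such a timer is typically wrapped around GPU work follows; do_gpu_work is a placeholder, and the stream must be synchronized before stop(), otherwise only the launch overhead is measured:

    #include <cstdio>
    #include <cuda_runtime.h>

    void do_gpu_work(cudaStream_t stream);  // placeholder for the kernels being measured

    // Sketch: wall-clock timing with the (now deleted) my_timer helper.
    void time_gpu_work_example(cudaStream_t stream)
    {
        my_timer timer;
        timer.start();

        do_gpu_work(stream);            // enqueue the work
        cudaStreamSynchronize(stream);  // wait for the GPU, so dt covers the actual work

        timer.stop();
        printf("elapsed: %f ms\n", timer.dt);
    }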
src/turbomind/utils/cublasMMWrapper.cc
@@ -46,21 +46,17 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
         m_weightlayout_switch = std::stoi(env_weightlayout_str);
     }
-    const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
-    if (env_dump_str != nullptr) {
-        m_dump_switch = std::stoi(env_dump_str);
-    }
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (allocator_ != nullptr) {
         cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
         // the dequantization workspace is needed when using rocblas, or when using CK with the dump feature enabled
-        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0)
         {
             // temporary storage for the dequantized weights
             printf("alloc space for deqeight\n");
             deweight_workspace_ = allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
-            if (m_weightlayout_switch == 1 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+            if (m_weightlayout_switch == 1)
             {
                 printf("alloc space for xpading\n");
                 printf("weight layout is tn pading rocblas\n");
...
@@ -109,21 +105,17 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
         m_weightlayout_switch = std::stoi(env_weightlayout_str);
     }
-    const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
-    if (env_dump_str != nullptr) {
-        m_dump_switch = std::stoi(env_dump_str);
-    }
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (allocator_ != nullptr) {
         cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
         // the dequantization workspace is needed when using rocblas, or when using CK with the dump feature enabled
-        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0)
        {
             // temporary storage for the dequantized weights
             printf("alloc space for deqeight\n");
             deweight_workspace_ = allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
-            if (m_weightlayout_switch == 1 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+            if (m_weightlayout_switch == 1)
             {
                 printf("alloc space for xpading\n");
                 printf("weight layout is tn pading rocblas\n");
...
@@ -146,12 +138,12 @@ cublasMMWrapper::~cublasMMWrapper()
     mu_ = nullptr;
     if (allocator_ != nullptr) {
         allocator_->free((void**)(&cublas_workspace_));
-        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0)
         {
             // temporary storage for the dequantized weights
             printf("free space for deqeight\n");
             allocator_->free((void**)(&deweight_workspace_));
-            if (m_weightlayout_switch == 1 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+            if (m_weightlayout_switch == 1)
             {
                 printf("free space for xpading\n");
                 allocator_->free((void**)(&xpading_workspace_));
...
@@ -182,21 +174,17 @@ cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
         m_weightlayout_switch = std::stoi(env_weightlayout_str);
     }
-    const char* env_dump_str = std::getenv("LMDEPLOY_DUMP_SWITCH");
-    if (env_dump_str != nullptr) {
-        m_dump_switch = std::stoi(env_dump_str);
-    }
     TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (allocator_ != nullptr) {
         cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
         // the dequantization workspace is needed when using rocblas, or when using CK with the dump feature enabled
-        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+        if (m_weightlayout_switch == 1 || m_weightlayout_switch == 0)
         {
             // temporary storage for the dequantized weights
             printf("alloc space for deqeight\n");
             deweight_workspace_ = allocator_->reMalloc(deweight_workspace_, DEQ_WORKSPACE_SIZE, false);
-            if (m_weightlayout_switch == 1 || (m_weightlayout_switch == 2 && m_dump_switch == 1))
+            if (m_weightlayout_switch == 1)
             {
                 printf("alloc space for xpading\n");
                 printf("weight layout is tn pading rocblas\n");
...
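The net effect of the cublasMMWrapper changes is that workspace allocation now depends on m_weightlayout_switch alone: 0 or 1 (the rocblas paths, per the old comment) allocate the dequantized-weight buffer, 1 additionally allocates the x-padding buffer, and the special case for layout 2 with LMDEPLOY_DUMP_SWITCH (apparently the CK dump path) is dropped along with m_dump_switch. A condensed restatement of the new rule, using an illustrative helper that does not exist in the repository:

    // Illustrative summary of the allocation rule after this commit.
    struct QuantWorkspaces {
        bool need_deweight;  // temporary buffer for the dequantized weights
        bool need_xpading;   // padded copy of the activation tensor x
    };

    QuantWorkspaces plan_workspaces(int weightlayout_switch)
    {
        QuantWorkspaces ws{false, false};
        if (weightlayout_switch == 1 || weightlayout_switch == 0) {
            ws.need_deweight = true;     // rocblas paths dequantize the weights first
            if (weightlayout_switch == 1) {
                ws.need_xpading = true;  // the "tn pading rocblas" layout also pads x
            }
        }
        return ws;  // weightlayout_switch == 2 (the CK path) allocates neither
    }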
src/turbomind/utils/cublasMMWrapper.h
@@ -74,8 +74,8 @@ public:
     // padding workspace for x
     void* xpading_workspace_  = nullptr;
     void* deweight_workspace_ = nullptr;
-    int   m_weightlayout_switch = 1;
-    int   m_dump_switch         = 0;
+    int   m_weightlayout_switch = 0;
 
     cublasMMWrapper(cublasHandle_t   cublas_handle_,
                     cublasLtHandle_t cublaslt_handle_,
                     cudaStream_t     stream,